From aacc3b7e259c52e2f8cd078c6915bace4afc32c1 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Sat, 21 Mar 2015 18:38:19 +0900 Subject: [PATCH 001/141] TAJO-1439: Some method name is written wrongly. (Contributed by Jongyoung Park. Committed by jihoon) --- CHANGES | 3 +++ .../src/main/java/org/apache/tajo/master/QueryInProgress.java | 2 +- .../src/main/java/org/apache/tajo/master/QueryManager.java | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index 6854b4f2c9..1786ea56e3 100644 --- a/CHANGES +++ b/CHANGES @@ -48,6 +48,9 @@ Release 0.11.0 - unreleased TASKS + TAJO-1439: Some method name is written wrongly. + (Contributed by Jongyoung Park. Committed by jihoon) + TAJO-1380: Update JDBC documentation for new JDBC driver. (Contributed by Dongjoon Hyun, Committed by hyunsik) diff --git a/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java b/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java index bfc7b9f4c5..c24dd90ada 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java @@ -156,7 +156,7 @@ private void connectQueryMaster() throws Exception { queryMasterRpcClient = queryMasterRpc.getStub(); } - public void submmitQueryToMaster() { + public void submitQueryToMaster() { if(querySubmitted.get()) { return; } diff --git a/tajo-core/src/main/java/org/apache/tajo/master/QueryManager.java b/tajo-core/src/main/java/org/apache/tajo/master/QueryManager.java index 3c81540933..b1fa17dd32 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/QueryManager.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/QueryManager.java @@ -221,7 +221,7 @@ public void handle(QueryJobEvent event) { } if (event.getType() == QueryJobEvent.Type.QUERY_MASTER_START) { - queryInProgress.submmitQueryToMaster(); + queryInProgress.submitQueryToMaster(); } else if (event.getType() == QueryJobEvent.Type.QUERY_JOB_KILL) { 
scheduler.removeQuery(queryInProgress.getQueryId()); From e563db223ec6a00dfc9ccf386adb6995a8e65fac Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Thu, 12 Mar 2015 22:15:49 +0900 Subject: [PATCH 002/141] TAJO-1396 Unexpected IllegalMonitorStateException can be thrown in QueryInProgress Closes #416 Signed-off-by: Jinho Kim --- CHANGES | 3 +++ .../apache/tajo/master/QueryInProgress.java | 22 ++++++++++++++----- .../org/apache/tajo/master/QueryManager.java | 8 +------ 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/CHANGES b/CHANGES index 1786ea56e3..57243004da 100644 --- a/CHANGES +++ b/CHANGES @@ -21,6 +21,9 @@ Release 0.11.0 - unreleased BUG FIXES + TAJO-1396: Unexpected IllegalMonitorStateException can be thrown + in QueryInProgress. (Contributed by navis. Committed by jinho) + TAJO-1384: Duplicated output file path problem. (jihoon) TAJO-1386: CURRENT_DATE generates parsing errors sometimes. diff --git a/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java b/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java index c24dd90ada..668a770a5a 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java @@ -95,7 +95,7 @@ public void kill() { queryMasterRpcClient.killQuery(null, queryId.getProto(), NullCallback.get()); } } catch (Throwable e) { - catchException(e); + catchException("Failed to kill query " + queryId + " by exception " + e, e); } finally { writeLock.unlock(); } @@ -125,6 +125,11 @@ public void stopProgress() { public boolean startQueryMaster() { try { writeLock.lockInterruptibly(); + } catch (Exception e) { + catchException("Failed to lock by exception " + e, e); + return false; + } + try { LOG.info("Initializing QueryInProgress for QueryID=" + queryId); WorkerResourceManager resourceManager = masterContext.getResourceManager(); WorkerAllocatedResource resource = resourceManager.allocateQueryMaster(this); @@ -141,7 
+146,7 @@ public boolean startQueryMaster() { return true; } catch (Exception e) { - catchException(e); + catchException("Failed to start query master for query " + queryId + " by exception " + e, e); return false; } finally { writeLock.unlock(); @@ -163,12 +168,17 @@ public void submitQueryToMaster() { try { writeLock.lockInterruptibly(); + } catch (Exception e) { + LOG.error("Failed to lock by exception " + e.getMessage(), e); + return; + } + try { if(queryMasterRpcClient == null) { connectQueryMaster(); } if(queryMasterRpcClient == null) { - LOG.info("No QueryMaster conneciton info."); + LOG.info("No QueryMaster connection info."); //TODO wait return; } @@ -186,14 +196,14 @@ public void submitQueryToMaster() { querySubmitted.set(true); getQueryInfo().setQueryState(TajoProtos.QueryState.QUERY_MASTER_LAUNCHED); } catch (Exception e) { - LOG.error(e.getMessage(), e); + LOG.error("Failed to submit query " + queryId + " to master by exception " + e, e); } finally { writeLock.unlock(); } } - public void catchException(Throwable e) { - LOG.error(e.getMessage(), e); + public void catchException(String message, Throwable e) { + LOG.error(message, e); queryInfo.setQueryState(TajoProtos.QueryState.QUERY_FAILED); queryInfo.setLastMessage(StringUtils.stringifyException(e)); } diff --git a/tajo-core/src/main/java/org/apache/tajo/master/QueryManager.java b/tajo-core/src/main/java/org/apache/tajo/master/QueryManager.java index b1fa17dd32..0c8d8cea50 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/QueryManager.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/QueryManager.java @@ -87,7 +87,7 @@ public void serviceInit(Configuration conf) throws Exception { this.scheduler = new SimpleFifoScheduler(this); } catch (Exception e) { - catchException(null, e); + LOG.error("Failed to init service " + getName() + " by exception " + e, e); } super.serviceInit(conf); @@ -304,12 +304,6 @@ public long getExecutedQuerySize() { return executedQuerySize.get(); } - private 
void catchException(QueryId queryId, Exception e) { - LOG.error(e.getMessage(), e); - QueryInProgress queryInProgress = runningQueries.get(queryId); - queryInProgress.catchException(e); - } - public synchronized QueryCoordinatorProtocol.TajoHeartbeatResponse.ResponseCommand queryHeartbeat( QueryCoordinatorProtocol.TajoHeartbeat queryHeartbeat) { QueryInProgress queryInProgress = getQueryInProgress(new QueryId(queryHeartbeat.getQueryId())); From e854f830f2f9ff67a23c751cbbd7afa48d6f0500 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Wed, 25 Mar 2015 12:51:41 +0900 Subject: [PATCH 003/141] TAJO-1434: Fix supporting version of Hadoop. Closes #443 Signed-off-by: Jinho Kim --- CHANGES | 3 +++ tajo-docs/src/main/sphinx/getting_started.rst | 2 +- tajo-project/pom.xml | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index 57243004da..0abdbd313f 100644 --- a/CHANGES +++ b/CHANGES @@ -21,6 +21,9 @@ Release 0.11.0 - unreleased BUG FIXES + TAJO-1434: Fix supporting version of Hadoop. + (Contributed by Dongjoon Hyun, Committed by jinho) + TAJO-1396: Unexpected IllegalMonitorStateException can be thrown in QueryInProgress. (Contributed by navis. Committed by jinho) diff --git a/tajo-docs/src/main/sphinx/getting_started.rst b/tajo-docs/src/main/sphinx/getting_started.rst index 2e072228f1..eaf6973827 100644 --- a/tajo-docs/src/main/sphinx/getting_started.rst +++ b/tajo-docs/src/main/sphinx/getting_started.rst @@ -8,7 +8,7 @@ In this section, we explain setup of a standalone Tajo instance. 
It will run aga Prerequisites ====================== - * Hadoop 2.3.0 or higher (up to 2.5.1) + * Hadoop 2.3.0 or higher (up to 2.6.0) * Java 1.6 or 1.7 * Protocol buffer 2.5.0 diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index 9f1b1abc75..cec93e38db 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -33,7 +33,7 @@ UTF-8 UTF-8 - 2.5.1 + 2.6.0 2.5.0 0.11.0-SNAPSHOT 0.98.7-hadoop2 From a5350257d70366de1297877492111afe89336913 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 24 Mar 2015 19:05:24 +0900 Subject: [PATCH 004/141] TAJO-1147: Simple query doesn't work in Web UI Signed-off-by: JaeHwa Jung --- CHANGES | 5 ++++- .../java/org/apache/tajo/webapp/QueryExecutorServlet.java | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index 0abdbd313f..2b7b2572ad 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,6 @@ Tajo Change Log -Release 0.11.0 - unreleased +Release 0.10.1 - unreleased NEW FEATURES @@ -21,6 +21,9 @@ Release 0.11.0 - unreleased BUG FIXES + TAJO-1147: Simple query doesn't work in Web UI. + (Contributed by Jongyoung Park. Committed by jaehwa) + TAJO-1434: Fix supporting version of Hadoop. (Contributed by Dongjoon Hyun, Committed by jinho) diff --git a/tajo-core/src/main/java/org/apache/tajo/webapp/QueryExecutorServlet.java b/tajo-core/src/main/java/org/apache/tajo/webapp/QueryExecutorServlet.java index da7981cf7e..f265e50251 100644 --- a/tajo-core/src/main/java/org/apache/tajo/webapp/QueryExecutorServlet.java +++ b/tajo-core/src/main/java/org/apache/tajo/webapp/QueryExecutorServlet.java @@ -483,7 +483,7 @@ private void getQueryResult(QueryId tajoQueryId) { private void MakeResultText(ResultSet res, TableDesc desc) throws SQLException { ResultSetMetaData rsmd = res.getMetaData(); resultRows = desc.getStats() == null ? 
0 : desc.getStats().getNumRows(); - if (resultRows == 0) { + if (resultRows <= 0) { resultRows = 1000; } LOG.info("Tajo Query Result: " + desc.getPath() + "\n"); From becf85b0ac83e764e431f4c30706cca0c16d836f Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 27 Mar 2015 18:03:53 +0900 Subject: [PATCH 005/141] TAJO-1440: Some tests fail in parallel test environment in TestKillQuery. Closes #472 Signed-off-by: Jinho Kim --- CHANGES | 3 ++ .../org/apache/tajo/querymaster/Query.java | 5 +-- .../tajo/querymaster/QueryMasterTask.java | 12 ++++-- .../tajo/querymaster/TestKillQuery.java | 40 ++++++++++++++----- 4 files changed, 44 insertions(+), 16 deletions(-) diff --git a/CHANGES b/CHANGES index 2b7b2572ad..d111c8b228 100644 --- a/CHANGES +++ b/CHANGES @@ -20,6 +20,9 @@ Release 0.10.1 - unreleased (Contributed by navis, Committed by hyunsik) BUG FIXES + + TAJO-1440: Some tests fail in parallel test environment in TestKillQuery. + (Contributed by Jongyoung Park. Committed by jinho) TAJO-1147: Simple query doesn't work in Web UI. (Contributed by Jongyoung Park. 
Committed by jaehwa) diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/Query.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/Query.java index c2740e5b8d..1ce15fccef 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/Query.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/Query.java @@ -393,8 +393,7 @@ public void transition(Query query, QueryEvent queryEvent) { query.getExecutionBlockCursor().nextBlock()); stage.setPriority(query.priority--); query.addStage(stage); - - stage.handle(new StageEvent(stage.getId(), StageEventType.SQ_INIT)); + stage.getEventHandler().handle(new StageEvent(stage.getId(), StageEventType.SQ_INIT)); LOG.debug("Schedule unit plan: \n" + stage.getBlock().getPlan()); } } @@ -630,7 +629,7 @@ private void executeNextBlock(Query query) { Stage nextStage = new Stage(query.context, query.getPlan(), nextBlock); nextStage.setPriority(query.priority--); query.addStage(nextStage); - nextStage.handle(new StageEvent(nextStage.getId(), StageEventType.SQ_INIT)); + nextStage.getEventHandler().handle(new StageEvent(nextStage.getId(), StageEventType.SQ_INIT)); LOG.info("Scheduling Stage:" + nextStage.getId()); if(LOG.isDebugEnabled()) { diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/QueryMasterTask.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/QueryMasterTask.java index 0d1924bf75..465fa8494f 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/QueryMasterTask.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/QueryMasterTask.java @@ -116,7 +116,8 @@ public class QueryMasterTask extends CompositeService { new ArrayList(); public QueryMasterTask(QueryMaster.QueryMasterContext queryMasterContext, - QueryId queryId, Session session, QueryContext queryContext, String jsonExpr) { + QueryId queryId, Session session, QueryContext queryContext, + String jsonExpr, AsyncDispatcher dispatcher) { super(QueryMasterTask.class.getName()); this.queryMasterContext = 
queryMasterContext; @@ -125,6 +126,13 @@ public QueryMasterTask(QueryMaster.QueryMasterContext queryMasterContext, this.queryContext = queryContext; this.jsonExpr = jsonExpr; this.querySubmitTime = System.currentTimeMillis(); + this.dispatcher = dispatcher; + } + + public QueryMasterTask(QueryMaster.QueryMasterContext queryMasterContext, + QueryId queryId, Session session, QueryContext queryContext, + String jsonExpr) { + this(queryMasterContext, queryId, session, queryContext, jsonExpr, new AsyncDispatcher()); } @Override @@ -144,8 +152,6 @@ public void init(Configuration conf) { throw new UnimplementedException(resourceManagerClassName + " is not supported yet"); } addService(resourceAllocator); - - dispatcher = new AsyncDispatcher(); addService(dispatcher); dispatcher.register(StageEventType.class, new StageEventDispatcher()); diff --git a/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java b/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java index 8fb8e739d6..09be700dbc 100644 --- a/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java @@ -19,6 +19,8 @@ package org.apache.tajo.querymaster; import com.google.common.collect.Lists; +import org.apache.hadoop.yarn.event.AsyncDispatcher; +import org.apache.hadoop.yarn.event.Event; import org.apache.tajo.*; import org.apache.tajo.algebra.Expr; import org.apache.tajo.benchmark.TPCH; @@ -52,6 +54,8 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; import static org.junit.Assert.*; @@ -106,29 +110,26 @@ public final void testKillQueryFromInitState() throws Exception { GlobalPlanner globalPlanner = new GlobalPlanner(conf, catalog); globalPlanner.build(masterPlan); + CountDownLatch barrier = new CountDownLatch(1); + MockAsyncDispatch dispatch = new MockAsyncDispatch(barrier, 
StageEventType.SQ_INIT); + QueryMaster qm = cluster.getTajoWorkers().get(0).getWorkerContext().getQueryMaster(); QueryMasterTask queryMasterTask = new QueryMasterTask(qm.getContext(), - queryId, session, defaultContext, expr.toJson()); + queryId, session, defaultContext, expr.toJson(), dispatch); queryMasterTask.init(conf); queryMasterTask.getQueryTaskContext().getDispatcher().start(); queryMasterTask.startQuery(); try{ - cluster.waitForQueryState(queryMasterTask.getQuery(), TajoProtos.QueryState.QUERY_RUNNING, 2); - } finally { - assertEquals(TajoProtos.QueryState.QUERY_RUNNING, queryMasterTask.getQuery().getSynchronizedState()); + barrier.await(5000, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + fail("Query state : " + queryMasterTask.getQuery().getSynchronizedState()); } Stage stage = queryMasterTask.getQuery().getStages().iterator().next(); assertNotNull(stage); - try{ - cluster.waitForStageState(stage, StageState.INITED, 2); - } finally { - assertEquals(StageState.INITED, stage.getSynchronizedState()); - } - // fire kill event Query q = queryMasterTask.getQuery(); q.handle(new QueryEvent(queryId, QueryEventType.KILL)); @@ -223,4 +224,23 @@ public void testKillTask() throws Throwable { assertEquals(TajoProtos.TaskAttemptState.TA_KILLED, task.getStatus()); } } + + static class MockAsyncDispatch extends AsyncDispatcher { + private CountDownLatch latch; + private Enum eventType; + + MockAsyncDispatch(CountDownLatch latch, Enum eventType) { + super(); + this.latch = latch; + this.eventType = eventType; + } + + @Override + protected void dispatch(Event event) { + if (event.getType() == eventType) { + latch.countDown(); + } + super.dispatch(event); + } + } } From 8b00e41f56eea0ef54011cfab822506d10b34c14 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 29 Mar 2015 21:47:07 +0900 Subject: [PATCH 006/141] TAJO-1437: Resolve findbug warnings on Tajo JDBC Module. 
Closes #447 Signed-off-by: Jihoon Son --- CHANGES | 3 +++ .../tajo/jdbc/TajoDatabaseMetaData.java | 3 ++- .../tajo/jdbc/TajoPreparedStatement.java | 4 ++-- .../org/apache/tajo/jdbc/TajoStatement.java | 23 +++++++++++-------- 4 files changed, 21 insertions(+), 12 deletions(-) diff --git a/CHANGES b/CHANGES index d111c8b228..56cb020456 100644 --- a/CHANGES +++ b/CHANGES @@ -20,6 +20,9 @@ Release 0.10.1 - unreleased (Contributed by navis, Committed by hyunsik) BUG FIXES + + TAJO-1437: Resolve findbug warnings on Tajo JDBC Module. + (Contributed by Dongjoon Hyun, Committed by jihoon) TAJO-1440: Some tests fail in parallel test environment in TestKillQuery. (Contributed by Jongyoung Park. Committed by jinho) diff --git a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoDatabaseMetaData.java b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoDatabaseMetaData.java index 150e9bf69a..2368082bef 100644 --- a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoDatabaseMetaData.java +++ b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoDatabaseMetaData.java @@ -46,6 +46,7 @@ public class TajoDatabaseMetaData implements DatabaseMetaData { "abs,acos,asin,atan,atan2,ceiling,cos,degrees,exp,,floor,mod,pi,pow," + "radians,round,sign,sin,sqrt,tan"; private static final String STRING_FUNCTIONS = "ascii,chr,concat,left,length,ltrim,repeat,rtrim,substring"; + private static final String PROCEDURE_TERM = "UDF"; private final JdbcConnection conn; @@ -157,7 +158,7 @@ public String getSchemaTerm() throws SQLException { @Override public String getProcedureTerm() throws SQLException { - return new String("UDF"); + return PROCEDURE_TERM; } @Override diff --git a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoPreparedStatement.java b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoPreparedStatement.java index fa3df98400..229587a2d6 100644 --- a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoPreparedStatement.java +++ b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoPreparedStatement.java 
@@ -86,8 +86,8 @@ public void clearParameters() throws SQLException { @Override public boolean execute() throws SQLException { - ResultSet rs = executeImmediate(sql); - return rs != null; + resultSet = executeImmediate(sql); + return resultSet != null; } @Override diff --git a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoStatement.java b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoStatement.java index 57cd066463..0a0a849440 100644 --- a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoStatement.java +++ b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoStatement.java @@ -40,11 +40,6 @@ public class TajoStatement implements Statement { */ private ResultSet resultSet = null; - /** - * Add SQLWarnings to the warningChain if needed. - */ - private SQLWarning warningChain = null; - /** * Keep state so we can fail certain calls made after close(). */ @@ -71,9 +66,7 @@ public void clearBatch() throws SQLException { } @Override - public void clearWarnings() throws SQLException { - warningChain = null; - } + public void clearWarnings() throws SQLException {} @Override public void close() throws SQLException { @@ -219,6 +212,8 @@ public int executeUpdate(String sql, String[] columnNames) throws SQLException { @Override public Connection getConnection() throws SQLException { + if (isClosed) + throw new SQLException("Can't get connection after statement has been closed"); return conn; } @@ -229,6 +224,8 @@ public int getFetchDirection() throws SQLException { @Override public int getFetchSize() throws SQLException { + if (isClosed) + throw new SQLException("Can't get fetch size after statement has been closed"); return fetchSize; } @@ -264,6 +261,8 @@ public int getQueryTimeout() throws SQLException { @Override public ResultSet getResultSet() throws SQLException { + if (isClosed) + throw new SQLException("Can't get result set after statement has been closed"); return resultSet; } @@ -284,12 +283,16 @@ public int getResultSetType() throws SQLException { @Override 
public int getUpdateCount() throws SQLException { + if (isClosed) + throw new SQLException("Can't get update count after statement has been closed"); return 0; } @Override public SQLWarning getWarnings() throws SQLException { - return warningChain; + if (isClosed) + throw new SQLException("Can't get warnings after statement has been closed"); + return null; } @Override @@ -325,6 +328,8 @@ public void setFetchDirection(int direction) throws SQLException { @Override public void setFetchSize(int rows) throws SQLException { + if (isClosed) + throw new SQLException("Can't set fetch size after statement has been closed"); fetchSize = rows; } From c3b7d1354a5c50cd7a6d2950b671a04b30310c26 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 3 Apr 2015 11:11:10 +0900 Subject: [PATCH 007/141] TAJO-1501: Too many log message of HashShuffleAppenderManager. Signed-off-by: JaeHwa Jung --- CHANGES | 3 +++ .../org/apache/tajo/storage/HashShuffleAppenderManager.java | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 56cb020456..9a46f68231 100644 --- a/CHANGES +++ b/CHANGES @@ -9,6 +9,9 @@ Release 0.10.1 - unreleased IMPROVEMENT + TAJO-1501: Too many log message of HashShuffleAppenderManager. + (Contributed by Jongyoung Park. Committed by jaehwa) + TAJO-1395: Remove deprecated sql files for Oracle and PostgreSQL. (jihun) TAJO-1394: Support reconnect on tsql. 
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java index 466c6c7af0..d2e9b4dbd5 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/HashShuffleAppenderManager.java @@ -96,7 +96,9 @@ public HashShuffleAppender getAppender(TajoConf tajoConf, ExecutionBlockId ebId, partitionAppenderMeta.appender.init(); partitionAppenderMap.put(partId, partitionAppenderMeta); - LOG.info("Create Hash shuffle file(partId=" + partId + "): " + dataFile); + if (LOG.isDebugEnabled()) { + LOG.debug("Create Hash shuffle file(partId=" + partId + "): " + dataFile); + } } return partitionAppenderMeta.appender; From b729a49c821a874db8f432342c2c9fc83d537793 Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Fri, 3 Apr 2015 11:38:57 +0900 Subject: [PATCH 008/141] TAJO-1360: VALUES_ field in OPTIONS table of catalog store should be longer. Signed-off-by: JaeHwa Jung --- CHANGES | 3 +++ .../src/main/resources/schemas/derby/derby.xml | 2 +- .../src/main/resources/schemas/mariadb/table_properties.sql | 2 +- .../src/main/resources/schemas/mysql/table_properties.sql | 2 +- .../src/main/resources/schemas/oracle/oracle.xml | 2 +- .../src/main/resources/schemas/postgresql/postgresql.xml | 2 +- 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGES b/CHANGES index 9a46f68231..3f7880e1de 100644 --- a/CHANGES +++ b/CHANGES @@ -24,6 +24,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1360: VALUES_ field in OPTIONS table of catalog store should be longer. + (Contributed by DaeMyung Kang, Committed by jaehwa) + TAJO-1437: Resolve findbug warnings on Tajo JDBC Module. 
(Contributed by Dongjoon Hyun, Committed by jihoon) diff --git a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/derby/derby.xml b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/derby/derby.xml index db2473be5a..0815603308 100644 --- a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/derby/derby.xml +++ b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/derby/derby.xml @@ -86,7 +86,7 @@ CREATE TABLE OPTIONS ( TID INT NOT NULL REFERENCES TABLES (TID) ON DELETE CASCADE, KEY_ VARCHAR(255) NOT NULL, - VALUE_ VARCHAR(255) NOT NULL, + VALUE_ VARCHAR(4000) NOT NULL, CONSTRAINT C_OPTIONS_UNIQUE UNIQUE (TID, KEY_, VALUE_) )]]> diff --git a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/mariadb/table_properties.sql b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/mariadb/table_properties.sql index 7ce53627eb..b9a84e000c 100644 --- a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/mariadb/table_properties.sql +++ b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/mariadb/table_properties.sql @@ -1,7 +1,7 @@ CREATE TABLE OPTIONS ( TID INT NOT NULL, KEY_ VARCHAR(255) BINARY NOT NULL, - VALUE_ VARCHAR(255) NOT NULL, + VALUE_ VARCHAR(4000) NOT NULL, PRIMARY KEY (TID, KEY_), FOREIGN KEY (TID) REFERENCES TABLES (TID) ON DELETE CASCADE ) diff --git a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/mysql/table_properties.sql b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/mysql/table_properties.sql index 7ce53627eb..b9a84e000c 100644 --- a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/mysql/table_properties.sql +++ b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/mysql/table_properties.sql @@ -1,7 +1,7 @@ CREATE TABLE OPTIONS ( TID INT NOT NULL, KEY_ VARCHAR(255) BINARY NOT NULL, - VALUE_ VARCHAR(255) NOT NULL, + VALUE_ VARCHAR(4000) NOT NULL, PRIMARY KEY (TID, KEY_), FOREIGN KEY (TID) REFERENCES TABLES (TID) ON DELETE CASCADE ) diff 
--git a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/oracle/oracle.xml b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/oracle/oracle.xml index 8945fcad83..633c88ab1b 100644 --- a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/oracle/oracle.xml +++ b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/oracle/oracle.xml @@ -129,7 +129,7 @@ CREATE TABLE OPTIONS ( TID INT NOT NULL, KEY_ VARCHAR2(255) NOT NULL, - VALUE_ VARCHAR2(255) NOT NULL, + VALUE_ VARCHAR2(4000) NOT NULL, CONSTRAINT OPTIONS_PKEY PRIMARY KEY (TID, KEY_), FOREIGN KEY (TID) REFERENCES TABLES (TID) ON DELETE CASCADE )]]> diff --git a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/postgresql/postgresql.xml b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/postgresql/postgresql.xml index 8e5cbccf56..a67af522e4 100644 --- a/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/postgresql/postgresql.xml +++ b/tajo-catalog/tajo-catalog-server/src/main/resources/schemas/postgresql/postgresql.xml @@ -88,7 +88,7 @@ xsi:schemaLocation="http://tajo.apache.org/catalogstore ../DBMSSchemaDefinition. CREATE TABLE OPTIONS ( TID INT NOT NULL, KEY_ VARCHAR(255) NOT NULL, - VALUE_ VARCHAR(255) NOT NULL, + VALUE_ VARCHAR(4000) NOT NULL, CONSTRAINT OPTIONS_PKEY PRIMARY KEY (TID, KEY_), FOREIGN KEY (TID) REFERENCES TABLES (TID) ON DELETE CASCADE )]]> From 8173bc1f491f45050b3611868d1d8d0099056d0c Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 4 Apr 2015 19:00:31 +0900 Subject: [PATCH 009/141] TAJO-1492: Replace CSV examples into TEXT examples in docs. 
Signed-off-by: Jihoon Son --- CHANGES | 3 ++ .../sphinx/backup_and_restore/catalog.rst | 2 +- tajo-docs/src/main/sphinx/getting_started.rst | 2 +- .../src/main/sphinx/sql_language/ddl.rst | 2 +- .../sphinx/table_management/file_formats.rst | 2 +- .../table_management/table_overview.rst | 6 +-- .../table_management/{csv.rst => text.rst} | 46 +++++++++---------- 7 files changed, 33 insertions(+), 30 deletions(-) rename tajo-docs/src/main/sphinx/table_management/{csv.rst => text.rst} (72%) diff --git a/CHANGES b/CHANGES index 3f7880e1de..08ac729068 100644 --- a/CHANGES +++ b/CHANGES @@ -69,6 +69,9 @@ Release 0.10.1 - unreleased TASKS + TAJO-1462: Replace CSV examples into TEXT examples in docs. + (Contributed by Dongjoon Hyun, Committed by jihoon) + TAJO-1439: Some method name is written wrongly. (Contributed by Jongyoung Park. Committed by jihoon) diff --git a/tajo-docs/src/main/sphinx/backup_and_restore/catalog.rst b/tajo-docs/src/main/sphinx/backup_and_restore/catalog.rst index 200aa850cc..1c2b7096b6 100644 --- a/tajo-docs/src/main/sphinx/backup_and_restore/catalog.rst +++ b/tajo-docs/src/main/sphinx/backup_and_restore/catalog.rst @@ -28,7 +28,7 @@ For example, if you want to backup a table customer, you should type a command a -- Name: customer; Type: TABLE; Storage: CSV -- Path: file:/home/hyunsik/tpch/customer -- - CREATE EXTERNAL TABLE customer (c_custkey INT8, c_name TEXT, c_address TEXT, c_nationkey INT8, c_phone TEXT, c_acctbal FLOAT8, c_mktsegment TEXT, c_comment TEXT) USING CSV LOCATION 'file:/home/hyunsik/tpch/customer'; + CREATE EXTERNAL TABLE customer (c_custkey INT8, c_name TEXT, c_address TEXT, c_nationkey INT8, c_phone TEXT, c_acctbal FLOAT8, c_mktsegment TEXT, c_comment TEXT) USING TEXT LOCATION 'file:/home/hyunsik/tpch/customer'; If you want to restore the catalog from the SQL dump file, please type the below command: :: diff --git a/tajo-docs/src/main/sphinx/getting_started.rst b/tajo-docs/src/main/sphinx/getting_started.rst index 
eaf6973827..e30c3fef28 100644 --- a/tajo-docs/src/main/sphinx/getting_started.rst +++ b/tajo-docs/src/main/sphinx/getting_started.rst @@ -135,7 +135,7 @@ Here, we assume the schema as (int, text, float, text). :: name text, score float, type text) - using csv with ('text.delimiter'='|') location 'file:/home/x/table1'; + using text with ('text.delimiter'='|') location 'file:/home/x/table1'; To load an external table, you need to use ‘create external table’ statement. In the location clause, you should use the absolute directory path with an appropriate scheme. diff --git a/tajo-docs/src/main/sphinx/sql_language/ddl.rst b/tajo-docs/src/main/sphinx/sql_language/ddl.rst index 60b7190c20..662ccffe50 100644 --- a/tajo-docs/src/main/sphinx/sql_language/ddl.rst +++ b/tajo-docs/src/main/sphinx/sql_language/ddl.rst @@ -56,7 +56,7 @@ If you want to add an external table that contains compressed data, you should g ... L_COMMENT text) - USING csv WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') + USING TEXT WITH ('text.delimiter'='|','compression.codec'='org.apache.hadoop.io.compress.DeflateCodec') LOCATION 'hdfs://localhost:9010/tajo/warehouse/lineitem_100_snappy'; `compression.codec` parameter can have one of the following compression codecs: diff --git a/tajo-docs/src/main/sphinx/table_management/file_formats.rst b/tajo-docs/src/main/sphinx/table_management/file_formats.rst index c15dd3f6ee..05794979a2 100644 --- a/tajo-docs/src/main/sphinx/table_management/file_formats.rst +++ b/tajo-docs/src/main/sphinx/table_management/file_formats.rst @@ -7,7 +7,7 @@ Currently, Tajo provides four file formats as follows: .. 
toctree:: :maxdepth: 1 - csv + text rcfile parquet sequencefile \ No newline at end of file diff --git a/tajo-docs/src/main/sphinx/table_management/table_overview.rst b/tajo-docs/src/main/sphinx/table_management/table_overview.rst index 3d933c2eb9..3985e19cac 100644 --- a/tajo-docs/src/main/sphinx/table_management/table_overview.rst +++ b/tajo-docs/src/main/sphinx/table_management/table_overview.rst @@ -29,9 +29,9 @@ The following example is to set a custom field delimiter, NULL character, and co name text, score float, type text - ) USING CSV WITH('text.delimiter'='\u0001', - 'text.null'='\\N', - 'compression.codec'='org.apache.hadoop.io.compress.SnappyCodec'); + ) USING TEXT WITH('text.delimiter'='\u0001', + 'text.null'='\\N', + 'compression.codec'='org.apache.hadoop.io.compress.SnappyCodec'); Each physical table layout has its own specialized properties. They will be addressed in :doc:`/table_management/file_formats`. diff --git a/tajo-docs/src/main/sphinx/table_management/csv.rst b/tajo-docs/src/main/sphinx/table_management/text.rst similarity index 72% rename from tajo-docs/src/main/sphinx/table_management/csv.rst rename to tajo-docs/src/main/sphinx/table_management/text.rst index 53c6e1d276..3727b03bd9 100644 --- a/tajo-docs/src/main/sphinx/table_management/csv.rst +++ b/tajo-docs/src/main/sphinx/table_management/text.rst @@ -1,21 +1,21 @@ ************************************* -CSV (TextFile) +TEXT ************************************* -A character-separated values (CSV) file represents a tabular data set consisting of rows and columns. +A character-separated values plain-text file represents a tabular data set consisting of rows and columns. Each row is a plan-text line. A line is usually broken by a character line feed ``\n`` or carriage-return ``\r``. The line feed ``\n`` is the default delimiter in Tajo. 
Each record consists of multiple fields, separated by some other character or string, most commonly a literal vertical bar ``|``, comma ``,`` or tab ``\t``. The vertical bar is used as the default field delimiter in Tajo. ========================================= -How to Create a CSV Table ? +How to Create a TEXT Table ? ========================================= If you are not familiar with the ``CREATE TABLE`` statement, please refer to the Data Definition Language :doc:`/sql_language/ddl`. In order to specify a certain file format for your table, you need to use the ``USING`` clause in your ``CREATE TABLE`` -statement. The below is an example statement for creating a table using CSV files. +statement. The below is an example statement for creating a table using *TEXT* format. .. code-block:: sql @@ -25,7 +25,7 @@ statement. The below is an example statement for creating a table using CSV file name text, score float, type text - ) USING CSV; + ) USING TEXT; ========================================= Physical Properties @@ -34,19 +34,19 @@ Physical Properties Some table storage formats provide parameters for enabling or disabling features and adjusting physical parameters. The ``WITH`` clause in the CREATE TABLE statement allows users to set those parameters. -Now, the CSV storage format provides the following physical properties. +*TEXT* format provides the following physical properties. * ``text.delimiter``: delimiter character. ``|`` or ``\u0001`` is usually used, and the default field delimiter is ``|``. -* ``text.null``: NULL character. The default NULL character is an empty string ``''``. Hive's default NULL character is ``'\\N'``. +* ``text.null``: ``NULL`` character. The default ``NULL`` character is an empty string ``''``. Hive's default ``NULL`` character is ``'\\N'``. * ``compression.codec``: Compression codec. You can enable compression feature and set specified compression algorithm. The compression algorithm used to compress files. 
The compression codec name should be the fully qualified class name inherited from `org.apache.hadoop.io.compress.CompressionCodec `_. By default, compression is disabled. -* ``csvfile.serde`` (deprecated): custom (De)serializer class. ``org.apache.tajo.storage.TextSerializerDeserializer`` is the default (De)serializer class. +* ``text.serde``: custom (De)serializer class. ``org.apache.tajo.storage.text.CSVLineSerDe`` is the default (De)serializer class. * ``timezone``: the time zone that the table uses for writting. When table rows are read or written, ```timestamp``` and ```time``` column values are adjusted by this timezone if it is set. Time zone can be an abbreviation form like 'PST' or 'DST'. Also, it accepts an offset-based form like 'UTC+9' or a location-based form like 'Asia/Seoul'. * ``text.error-tolerance.max-num``: the maximum number of permissible parsing errors. This value should be an integer value. By default, ``text.error-tolerance.max-num`` is ``0``. According to the value, parsing errors will be handled in different ways. * If ``text.error-tolerance.max-num < 0``, all parsing errors are ignored. * If ``text.error-tolerance.max-num == 0``, any parsing error is not allowed. If any error occurs, the query will be failed. (default) * If ``text.error-tolerance.max-num > 0``, the given number of parsing errors in each task will be pemissible. -The following example is to set a custom field delimiter, NULL character, and compression codec: +The following example is to set a custom field delimiter, ``NULL`` character, and compression codec: .. 
code-block:: sql @@ -55,24 +55,24 @@ The following example is to set a custom field delimiter, NULL character, and co name text, score float, type text - ) USING CSV WITH('text.delimiter'='\u0001', - 'text.null'='\\N', - 'compression.codec'='org.apache.hadoop.io.compress.SnappyCodec'); + ) USING TEXT WITH('text.delimiter'='\u0001', + 'text.null'='\\N', + 'compression.codec'='org.apache.hadoop.io.compress.SnappyCodec'); .. warning:: - Be careful when using ``\n`` as the field delimiter because CSV uses ``\n`` as the line delimiter. + Be careful when using ``\n`` as the field delimiter because *TEXT* format tables use ``\n`` as the line delimiter. At the moment, Tajo does not provide a way to specify the line delimiter. ========================================= Custom (De)serializer ========================================= -The CSV storage format not only provides reading and writing interfaces for CSV data but also allows users to process custom +The *TEXT* format not only provides reading and writing interfaces for text data but also allows users to process custom plan-text file formats with user-defined (De)serializer classes. For example, with custom (de)serializers, Tajo can process JSON file formats or any specialized plan-text file formats. -In order to specify a custom (De)serializer, set a physical property ``csvfile.serde``. +In order to specify a custom (De)serializer, set a physical property ``text.serde``. The property value should be a fully qualified class name. For example: @@ -84,25 +84,25 @@ For example: name text, score float, type text - ) USING CSV WITH ('csvfile.serde'='org.my.storage.CustomSerializerDeserializer') + ) USING TEXT WITH ('text.serde'='org.my.storage.CustomSerializerDeserializer') ========================================= Null Value Handling Issues ========================================= -In default, NULL character in CSV files is an empty string ``''``. 
-In other words, an empty field is basically recognized as a NULL value in Tajo. -If a field domain is ``TEXT``, an empty field is recognized as a string value ``''`` instead of NULL value. -Besides, You can also use your own NULL character by specifying a physical property ``text.null``. +In default, ``NULL`` character in *TEXT* format is an empty string ``''``. +In other words, an empty field is basically recognized as a ``NULL`` value in Tajo. +If a field domain is ``TEXT``, an empty field is recognized as a string value ``''`` instead of ``NULL`` value. +Besides, You can also use your own ``NULL`` character by specifying a physical property ``text.null``. ========================================= Compatibility Issues with Apache Hive™ ========================================= -CSV files generated in Tajo can be processed directly by Apache Hive™ without further processing. +*TEXT* tables generated in Tajo can be processed directly by Apache Hive™ without further processing. In this section, we explain some compatibility issue for users who use both Hive and Tajo. -If you set a custom field delimiter, the CSV tables cannot be directly used in Hive. +If you set a custom field delimiter, the *TEXT* tables cannot be directly used in Hive. In order to specify the custom field delimiter in Hive, you need to use ``ROW FORMAT DELIMITED FIELDS TERMINATED BY`` clause in a Hive's ``CREATE TABLE`` statement as follows: @@ -112,4 +112,4 @@ clause in a Hive's ``CREATE TABLE`` statement as follows: ROW FORMAT DELIMITED FIELDS TERMINATED BY '|' STORED AS TEXT -To the best of our knowledge, there is not way to specify a custom NULL character in Hive. +To the best of our knowledge, there is not way to specify a custom ``NULL`` character in Hive. From cb6965977e0939e97e87efaf667be75c14d70d49 Mon Sep 17 00:00:00 2001 From: YeonSu Han Date: Sun, 5 Apr 2015 23:27:05 +0900 Subject: [PATCH 010/141] TAJO-1400: Add TajoStatement::setMaxRows method support. 
Signed-off-by: Jihoon Son --- CHANGES | 3 ++ .../org/apache/tajo/client/QueryClient.java | 4 ++ .../apache/tajo/client/QueryClientImpl.java | 13 ++++++ .../apache/tajo/client/TajoClientImpl.java | 8 ++++ .../org/apache/tajo/jdbc/FetchResultSet.java | 5 ++- .../org/apache/tajo/jdbc/TestTajoJdbc.java | 40 ++++++++++++++++++- .../org/apache/tajo/jdbc/TajoStatement.java | 7 +++- 7 files changed, 76 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 08ac729068..3d2b41f3e9 100644 --- a/CHANGES +++ b/CHANGES @@ -9,6 +9,9 @@ Release 0.10.1 - unreleased IMPROVEMENT + TAJO-1400: Add TajoStatement::setMaxRows method support. + (Contributed by YeonSu Han, Committed by jihoon) + TAJO-1501: Too many log message of HashShuffleAppenderManager. (Contributed by Jongyoung Park. Committed by jaehwa) diff --git a/tajo-client/src/main/java/org/apache/tajo/client/QueryClient.java b/tajo-client/src/main/java/org/apache/tajo/client/QueryClient.java index 7c7db33656..39b5fc3782 100644 --- a/tajo-client/src/main/java/org/apache/tajo/client/QueryClient.java +++ b/tajo-client/src/main/java/org/apache/tajo/client/QueryClient.java @@ -45,6 +45,10 @@ public interface QueryClient extends Closeable { public Map getClientSideSessionVars(); public String getBaseDatabase(); + + public void setMaxRows(int maxRows); + + public int getMaxRows(); @Override public void close(); diff --git a/tajo-client/src/main/java/org/apache/tajo/client/QueryClientImpl.java b/tajo-client/src/main/java/org/apache/tajo/client/QueryClientImpl.java index fae613a0c6..4444a31c85 100644 --- a/tajo-client/src/main/java/org/apache/tajo/client/QueryClientImpl.java +++ b/tajo-client/src/main/java/org/apache/tajo/client/QueryClientImpl.java @@ -51,11 +51,14 @@ public class QueryClientImpl implements QueryClient { private static final Log LOG = LogFactory.getLog(QueryClientImpl.class); private final SessionConnection connection; private final int defaultFetchRows; +//maxRows number is limit value of resultSet. 
The value must be >= 0, and 0 means there is not limit. + private int maxRows; public QueryClientImpl(SessionConnection connection) { this.connection = connection; this.defaultFetchRows = this.connection.getProperties().getInt(SessionVars.FETCH_ROWNUM.getConfVars().keyname(), SessionVars.FETCH_ROWNUM.getConfVars().defaultIntVal); + this.maxRows = 0; } @Override @@ -577,6 +580,16 @@ public QueryStatus killQuery(final QueryId queryId) return status; } + @Override + public void setMaxRows(int maxRows) { + this.maxRows = maxRows; + } + + @Override + public int getMaxRows() { + return this.maxRows; + } + public QueryInfoProto getQueryInfo(final QueryId queryId) throws ServiceException { return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), TajoMasterClientProtocol.class, false, true) { diff --git a/tajo-client/src/main/java/org/apache/tajo/client/TajoClientImpl.java b/tajo-client/src/main/java/org/apache/tajo/client/TajoClientImpl.java index e61bea0663..612b56e528 100644 --- a/tajo-client/src/main/java/org/apache/tajo/client/TajoClientImpl.java +++ b/tajo-client/src/main/java/org/apache/tajo/client/TajoClientImpl.java @@ -166,6 +166,14 @@ public QueryHistoryProto getQueryHistory(final QueryId queryId) throws ServiceEx return queryClient.getQueryHistory(queryId); } + public void setMaxRows(int maxRows) { + queryClient.setMaxRows(maxRows); + } + + public int getMaxRows() { + return queryClient.getMaxRows(); + } + /*------------------------------------------------------------------------*/ // CatalogClient wrappers /*------------------------------------------------------------------------*/ diff --git a/tajo-client/src/main/java/org/apache/tajo/jdbc/FetchResultSet.java b/tajo-client/src/main/java/org/apache/tajo/jdbc/FetchResultSet.java index 06773f4970..efe070eeeb 100644 --- a/tajo-client/src/main/java/org/apache/tajo/jdbc/FetchResultSet.java +++ b/tajo-client/src/main/java/org/apache/tajo/jdbc/FetchResultSet.java @@ -32,10 +32,13 @@ public class 
FetchResultSet extends TajoResultSetBase { private int fetchRowNum; private TajoMemoryResultSet currentResultSet; private boolean finished = false; +// maxRows number is limit value of resultSet. The value must be >= 0, and 0 means there is not limit. + private int maxRows; public FetchResultSet(QueryClient tajoClient, Schema schema, QueryId queryId, int fetchRowNum) { super(tajoClient.getClientSideSessionVars()); this.tajoClient = tajoClient; + this.maxRows = tajoClient.getMaxRows(); this.queryId = queryId; this.fetchRowNum = fetchRowNum; this.totalRow = Integer.MAX_VALUE; @@ -48,7 +51,7 @@ public QueryId getQueryId() { @Override protected Tuple nextTuple() throws IOException { - if (finished) { + if (finished || (maxRows > 0 && curRow >= maxRows)) { return null; } diff --git a/tajo-core/src/test/java/org/apache/tajo/jdbc/TestTajoJdbc.java b/tajo-core/src/test/java/org/apache/tajo/jdbc/TestTajoJdbc.java index db6192cb97..36bbd942df 100644 --- a/tajo-core/src/test/java/org/apache/tajo/jdbc/TestTajoJdbc.java +++ b/tajo-core/src/test/java/org/apache/tajo/jdbc/TestTajoJdbc.java @@ -635,4 +635,42 @@ public void testAlterTableAddPartition() throws Exception { } } } -} \ No newline at end of file + + @Test + public void testMaxRows() throws Exception { + String connUri = buildConnectionUri(tajoMasterAddress.getHostName(), tajoMasterAddress.getPort(), + DEFAULT_DATABASE_NAME); + Connection conn = DriverManager.getConnection(connUri); + assertTrue(conn.isValid(100)); + Statement stmt = null; + ResultSet res = null; + //Parameter value setting for test. + final int maxRowsNum = 3; + int resultRowsNum = 0, returnMaxRows = 0; + try { + stmt = conn.createStatement(); + //set maxRows(3) + stmt.setMaxRows(maxRowsNum); + //get MaxRows + returnMaxRows = stmt.getMaxRows(); + res = stmt.executeQuery("select * from lineitem"); + assertNotNull(res); + while (res.next()) { + //Actuality result Rows. 
+ resultRowsNum++; + } + //The test success, if maxRowsNum and resultRowsNum and returnMaxRows is same. + assertTrue(maxRowsNum == resultRowsNum && maxRowsNum == returnMaxRows); + } finally { + if (res != null) { + cleanupQuery(res); + } + if (stmt != null) { + stmt.close(); + } + if (conn != null) { + conn.close(); + } + } + } +} diff --git a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoStatement.java b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoStatement.java index 0a0a849440..820e350563 100644 --- a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoStatement.java +++ b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/TajoStatement.java @@ -241,7 +241,7 @@ public int getMaxFieldSize() throws SQLException { @Override public int getMaxRows() throws SQLException { - throw new SQLFeatureNotSupportedException("getMaxRows not supported"); + return tajoClient.getMaxRows() ; } @Override @@ -340,7 +340,10 @@ public void setMaxFieldSize(int max) throws SQLException { @Override public void setMaxRows(int max) throws SQLException { - throw new SQLFeatureNotSupportedException("setMaxRows not supported"); + if (max < 0) { + throw new SQLException("max must be >= 0"); + } + tajoClient.setMaxRows(max); } @Override From 7f7b132884d2d1dede71428fb68f44c08e3fbb05 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Tue, 7 Apr 2015 16:13:23 +0900 Subject: [PATCH 011/141] TAJO-1529: Implement json_extract_path_text(string, string) function. 
--- CHANGES | 3 + tajo-core/pom.xml | 8 ++ .../function/json/JsonExtractPathText.java | 90 +++++++++++++++++++ .../engine/function/TestJsonFunctions.java | 36 ++++++++ tajo-docs/src/main/sphinx/functions.rst | 3 +- .../src/main/sphinx/functions/json_func.rst | 16 ++++ tajo-project/pom.xml | 10 +++ tajo-storage/tajo-storage-hdfs/pom.xml | 1 - .../storage/json/JsonLineDeserializer.java | 26 +----- .../tajo/storage/json/JsonLineSerializer.java | 3 +- .../text/TextFieldSerializerDeserializer.java | 12 +-- 11 files changed, 177 insertions(+), 31 deletions(-) create mode 100644 tajo-core/src/main/java/org/apache/tajo/engine/function/json/JsonExtractPathText.java create mode 100644 tajo-core/src/test/java/org/apache/tajo/engine/function/TestJsonFunctions.java create mode 100644 tajo-docs/src/main/sphinx/functions/json_func.rst diff --git a/CHANGES b/CHANGES index 3d2b41f3e9..977262a248 100644 --- a/CHANGES +++ b/CHANGES @@ -89,6 +89,9 @@ Release 0.10.1 - unreleased SUB TASKS + TAJO-1529: Implement json_extract_path_text(string, string) function. + (jinho) + TAJO-1353: Nested record support in CREATE TABLE statement. (hyunsik) diff --git a/tajo-core/pom.xml b/tajo-core/pom.xml index 743180fc0a..42b143b1ad 100644 --- a/tajo-core/pom.xml +++ b/tajo-core/pom.xml @@ -454,6 +454,14 @@ jcip-annotations test + + net.minidev + json-smart + + + com.jayway.jsonpath + json-path + diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/function/json/JsonExtractPathText.java b/tajo-core/src/main/java/org/apache/tajo/engine/function/json/JsonExtractPathText.java new file mode 100644 index 0000000000..f0603bec07 --- /dev/null +++ b/tajo-core/src/main/java/org/apache/tajo/engine/function/json/JsonExtractPathText.java @@ -0,0 +1,90 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +package org.apache.tajo.engine.function.json; + +import com.jayway.jsonpath.JsonPath; +import net.minidev.json.JSONObject; +import net.minidev.json.parser.JSONParser; +import org.apache.tajo.catalog.Column; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.datum.Datum; +import org.apache.tajo.datum.DatumFactory; +import org.apache.tajo.datum.NullDatum; +import org.apache.tajo.engine.function.annotation.Description; +import org.apache.tajo.engine.function.annotation.ParamTypes; +import org.apache.tajo.plan.function.GeneralFunction; +import org.apache.tajo.storage.Tuple; + +/** + * json_extract_path_text(string, string) - + * Extracts JSON string from a JSON string based on json path specified, + * and returns JSON string pointed to by xPath. + * + * + * Returns null if either argument is null. 
+ * + * Example: + * SELECT json_extract_path_text('{"sample" : {"name" : "tajo"}}','$.sample.name') FROM src LIMIT 1;\n" + * -> result: 'tajo' + */ +@Description( + functionName = "json_extract_path_text", + description = "Returns JSON string pointed to by xPath", + detail = "Extracts JSON string from a JSON string based on json path specified,\n" + + "and returns JSON string pointed to by xPath.", + example = "> SELECT json_extract_path_text('{\"sample\" : {\"name\" : \"tajo\"}}','$.sample.name');\n" + + "tajo", + returnType = TajoDataTypes.Type.TEXT, + paramTypes = {@ParamTypes(paramTypes = {TajoDataTypes.Type.TEXT, TajoDataTypes.Type.TEXT})} +) +public class JsonExtractPathText extends GeneralFunction { + private JSONParser parser; + private JsonPath jsonPath; + + public JsonExtractPathText() { + super(new Column[]{ + new Column("string", TajoDataTypes.Type.TEXT), + new Column("string", TajoDataTypes.Type.TEXT), + }); + parser = new JSONParser(JSONParser.MODE_JSON_SIMPLE | JSONParser.IGNORE_CONTROL_CHAR); + } + + @Override + public Datum eval(Tuple params) { + Datum json = params.get(0); + Datum xPath = params.get(1); + + if (json instanceof NullDatum || xPath instanceof NullDatum) { + return NullDatum.get(); + } + + // default is JsonSmartMappingProvider + try { + + JSONObject object = (JSONObject) parser.parse(json.asTextBytes()); + if (jsonPath == null) { + jsonPath = JsonPath.compile(xPath.asChars()); + } + return DatumFactory.createText(jsonPath.read(object).toString()); + } catch (Exception e) { + return NullDatum.get(); + } + } +} diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/function/TestJsonFunctions.java b/tajo-core/src/test/java/org/apache/tajo/engine/function/TestJsonFunctions.java new file mode 100644 index 0000000000..89f0439df9 --- /dev/null +++ b/tajo-core/src/test/java/org/apache/tajo/engine/function/TestJsonFunctions.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.engine.function; + + +import org.apache.tajo.engine.eval.ExprTestBase; +import org.junit.Test; + +import java.io.IOException; + +public class TestJsonFunctions extends ExprTestBase { + static final String JSON_DOCUMENT = "{\"map\" : {\"name\" : \"tajo\"}, \"array\" : [1,2,3]}"; + + @Test + public void testJsonExtractPathText() throws IOException { + testSimpleEval("select json_extract_path_text('" + JSON_DOCUMENT + "', '$.map.name') ", new String[]{"tajo"}); + testSimpleEval("select json_extract_path_text('" + JSON_DOCUMENT + "', '$.array[1]') ", new String[]{"2"}); + + } +} diff --git a/tajo-docs/src/main/sphinx/functions.rst b/tajo-docs/src/main/sphinx/functions.rst index fb93d1ebd0..453edf4804 100644 --- a/tajo-docs/src/main/sphinx/functions.rst +++ b/tajo-docs/src/main/sphinx/functions.rst @@ -8,4 +8,5 @@ Functions functions/math_func_and_operators functions/string_func_and_operators functions/datetime_func_and_operators - functions/network_func_and_operators \ No newline at end of file + functions/network_func_and_operators + functions/json_func \ No newline at end of file diff --git a/tajo-docs/src/main/sphinx/functions/json_func.rst b/tajo-docs/src/main/sphinx/functions/json_func.rst new file mode 100644 index 
0000000000..5bf5814a8f --- /dev/null +++ b/tajo-docs/src/main/sphinx/functions/json_func.rst @@ -0,0 +1,16 @@ +******************************* +JSON Functions +******************************* + +.. function:: json_extract_path_text (string json, string xpath) + Extracts JSON string from a JSON string based on json path specified and returns JSON string pointed to by xPath + + :param string: + :param string: + :rtype: text + :example: + + .. code-block:: sql + + json_extract_path_text('{"test" : {"key" : "tajo"}}','$.test.key'); + > tajo diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index cec93e38db..d9685ee670 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -1063,6 +1063,16 @@ jcip-annotations 1.0-1 + + net.minidev + json-smart + 2.1.1 + + + com.jayway.jsonpath + json-path + 2.0.0 + diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 33db33bf2c..33b3bc7191 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -351,7 +351,6 @@ limitations under the License. 
net.minidev json-smart - 2.0 diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java index a7e02a4b35..204f607a72 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineDeserializer.java @@ -20,10 +20,10 @@ import io.netty.buffer.ByteBuf; -import net.minidev.json.JSONArray; import net.minidev.json.JSONObject; import net.minidev.json.parser.JSONParser; import net.minidev.json.parser.ParseException; +import org.apache.commons.net.util.Base64; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.SchemaUtil; import org.apache.tajo.catalog.TableMeta; @@ -37,7 +37,6 @@ import org.apache.tajo.storage.text.TextLineParsingError; import java.io.IOException; -import java.util.Iterator; public class JsonLineDeserializer extends TextLineDeserializer { private JSONParser parser; @@ -174,31 +173,14 @@ public void deserialize(ByteBuf buf, Tuple output) throws IOException, TextLineP case BINARY: case VARBINARY: case BLOB: { - Object jsonObject = object.get(fieldName); + Object jsonObject = object.getAsString(fieldName); if (jsonObject == null) { output.put(actualIdx, NullDatum.get()); break; } - if (jsonObject instanceof String) { - output.put(actualIdx, DatumFactory.createBlob((String) jsonObject)); - } else if (jsonObject instanceof JSONArray) { - JSONArray jsonArray = (JSONArray) jsonObject; - byte[] bytes = new byte[jsonArray.size()]; - Iterator it = jsonArray.iterator(); - int arrayIdx = 0; - while (it.hasNext()) { - bytes[arrayIdx++] = ((Long) it.next()).byteValue(); - } - if (bytes.length > 0) { - output.put(actualIdx, DatumFactory.createBlob(bytes)); - } else { - output.put(actualIdx, NullDatum.get()); - } - break; - } else { - throw new IOException("Unknown 
json object: " + object.getClass().getSimpleName()); - } + + output.put(actualIdx, DatumFactory.createBlob(Base64.decodeBase64((String) jsonObject))); break; } case INET4: diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java index cd31ada558..d6faf2df40 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/json/JsonLineSerializer.java @@ -20,6 +20,7 @@ import net.minidev.json.JSONObject; +import org.apache.commons.net.util.Base64; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.SchemaUtil; import org.apache.tajo.catalog.TableMeta; @@ -106,7 +107,7 @@ public int serialize(OutputStream out, Tuple input) throws IOException { case BINARY: case BLOB: case VARBINARY: - jsonObject.put(fieldName, input.getBytes(i)); + jsonObject.put(fieldName, Base64.encodeBase64String(input.getBytes(i))); break; case NULL_TYPE: diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java index ae7565d464..e637c7f0cb 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java @@ -26,11 +26,11 @@ import org.apache.tajo.catalog.Column; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.common.TajoDataTypes; -import org.apache.tajo.conf.TajoConf; import org.apache.tajo.datum.*; import org.apache.tajo.datum.protobuf.ProtobufJsonFormat; import org.apache.tajo.storage.FieldSerializerDeserializer; import 
org.apache.tajo.storage.StorageConstants; +import org.apache.tajo.util.Bytes; import org.apache.tajo.util.NumberUtil; import java.io.IOException; @@ -39,8 +39,8 @@ import java.util.TimeZone; public class TextFieldSerializerDeserializer implements FieldSerializerDeserializer { - public static final byte[] trueBytes = "true".getBytes(); - public static final byte[] falseBytes = "false".getBytes(); + private static final byte[] trueBytes = "true".getBytes(Bytes.UTF8_CHARSET); + private static final byte[] falseBytes = "false".getBytes(Bytes.UTF8_CHARSET); private static ProtobufJsonFormat protobufJsonFormat = ProtobufJsonFormat.getInstance(); private final CharsetDecoder decoder = CharsetUtil.getDecoder(CharsetUtil.UTF_8); @@ -108,7 +108,7 @@ public int serialize(OutputStream out, Datum datum, Column col, int columnIndex, break; case TIME: if (hasTimezone) { - bytes = ((TimeDatum) datum).asChars(timezone, true).getBytes(); + bytes = ((TimeDatum) datum).asChars(timezone, true).getBytes(Bytes.UTF8_CHARSET); } else { bytes = datum.asTextBytes(); } @@ -117,7 +117,7 @@ public int serialize(OutputStream out, Datum datum, Column col, int columnIndex, break; case TIMESTAMP: if (hasTimezone) { - bytes = ((TimestampDatum) datum).asChars(timezone, true).getBytes(); + bytes = ((TimestampDatum) datum).asChars(timezone, true).getBytes(Bytes.UTF8_CHARSET); } else { bytes = datum.asTextBytes(); } @@ -132,7 +132,7 @@ public int serialize(OutputStream out, Datum datum, Column col, int columnIndex, break; case PROTOBUF: ProtobufDatum protobuf = (ProtobufDatum) datum; - byte[] protoBytes = protobufJsonFormat.printToString(protobuf.get()).getBytes(); + byte[] protoBytes = protobufJsonFormat.printToString(protobuf.get()).getBytes(Bytes.UTF8_CHARSET); length = protoBytes.length; out.write(protoBytes, 0, protoBytes.length); break; From e75b928fe92b00c4cf635abfc877fecd3b7e31e1 Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Tue, 7 Apr 2015 22:24:34 +0900 Subject: [PATCH 012/141] TAJO-1538: 
TajoWorkerResourceManager.allocatedResourceMap is increasing forever. Signed-off-by: Jinho Kim --- CHANGES | 3 +++ .../org/apache/tajo/master/rm/TajoWorkerResourceManager.java | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 977262a248..5b1b2b6dae 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1538: TajoWorkerResourceManager.allocatedResourceMap is increasing + forever. (Contributed by navis. Committed by jinho) + TAJO-1360: VALUES_ field in OPTIONS table of catalog store should be longer. (Contributed by DaeMyung Kang, Committed by jaehwa) diff --git a/tajo-core/src/main/java/org/apache/tajo/master/rm/TajoWorkerResourceManager.java b/tajo-core/src/main/java/org/apache/tajo/master/rm/TajoWorkerResourceManager.java index 0d830eaa34..541b1b6c81 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/rm/TajoWorkerResourceManager.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/rm/TajoWorkerResourceManager.java @@ -508,7 +508,7 @@ private List chooseWorkers(WorkerResourceRequest resour */ @Override public void releaseWorkerResource(ContainerProtocol.TajoContainerIdProto containerId) { - AllocatedWorkerResource allocated = allocatedResourceMap.get(containerId); + AllocatedWorkerResource allocated = allocatedResourceMap.remove(containerId); if(allocated != null) { LOG.info("Release Resource: " + allocated.allocatedDiskSlots + "," + allocated.allocatedMemoryMB); allocated.worker.getResource().releaseResource( allocated.allocatedDiskSlots, allocated.allocatedMemoryMB); From 75e985eaca8146020b2b9d79e609ba1ed041998f Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Sat, 18 Apr 2015 14:53:05 +0900 Subject: [PATCH 013/141] TAJO-1564: TestFetcher fails occasionally. 
(jinho) --- CHANGES | 2 + .../java/org/apache/tajo/worker/Fetcher.java | 47 +++++++------------ 2 files changed, 18 insertions(+), 31 deletions(-) diff --git a/CHANGES b/CHANGES index 5b1b2b6dae..995305e85b 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,8 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1564: TestFetcher fails occasionally. (jinho) + TAJO-1538: TajoWorkerResourceManager.allocatedResourceMap is increasing forever. (Contributed by navis. Committed by jinho) diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/Fetcher.java b/tajo-core/src/main/java/org/apache/tajo/worker/Fetcher.java index 31599a3de9..94488d05f8 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/Fetcher.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/Fetcher.java @@ -18,8 +18,15 @@ package org.apache.tajo.worker; +import io.netty.bootstrap.Bootstrap; +import io.netty.buffer.ByteBuf; import io.netty.buffer.PooledByteBufAllocator; import io.netty.channel.*; +import io.netty.channel.socket.nio.NioSocketChannel; +import io.netty.handler.codec.http.*; +import io.netty.handler.timeout.ReadTimeoutException; +import io.netty.handler.timeout.ReadTimeoutHandler; +import io.netty.util.ReferenceCountUtil; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.IOUtils; @@ -28,24 +35,6 @@ import org.apache.tajo.pullserver.retriever.FileChunk; import org.apache.tajo.rpc.RpcChannelFactory; -import io.netty.bootstrap.Bootstrap; -import io.netty.buffer.ByteBuf; -import io.netty.channel.socket.nio.NioSocketChannel; -import io.netty.handler.codec.http.DefaultHttpRequest; -import io.netty.handler.codec.http.HttpClientCodec; -import io.netty.handler.codec.http.HttpContent; -import io.netty.handler.codec.http.HttpContentDecompressor; -import io.netty.handler.codec.http.HttpHeaders; -import io.netty.handler.codec.http.HttpMethod; -import io.netty.handler.codec.http.HttpRequest; -import 
io.netty.handler.codec.http.HttpResponse; -import io.netty.handler.codec.http.HttpResponseStatus; -import io.netty.handler.codec.http.HttpVersion; -import io.netty.handler.codec.http.LastHttpContent; -import io.netty.handler.timeout.ReadTimeoutException; -import io.netty.handler.timeout.ReadTimeoutHandler; -import io.netty.util.ReferenceCountUtil; - import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; @@ -72,7 +61,7 @@ public class Fetcher { private final boolean useLocalFile; private long startTime; - private long finishTime; + private volatile long finishTime; private long fileLen; private int messageReceiveCount; private TajoProtos.FetcherState state; @@ -167,19 +156,17 @@ public FileChunk get() throws IOException { LOG.info("Status: " + getState() + ", URI:" + uri); // Send the HTTP request. - ChannelFuture channelFuture = channel.writeAndFlush(request); - - // Wait for the server to close the connection. - channel.closeFuture().awaitUninterruptibly(); + channel.writeAndFlush(request); - channelFuture.addListener(ChannelFutureListener.CLOSE); + // Wait for the server to close the connection. throw exception if failed + channel.closeFuture().syncUninterruptibly(); fileChunk.setLength(fileChunk.getFile().length()); return fileChunk; } finally { - if(future != null){ + if(future != null && future.channel().isOpen()){ // Close the channel to exit. 
- future.channel().close(); + future.channel().close().awaitUninterruptibly(); } this.finishTime = System.currentTimeMillis(); @@ -262,14 +249,12 @@ public void channelRead(ChannelHandlerContext ctx, Object msg) fileLen = file.length(); } - IOUtils.cleanup(LOG, fc, raf); - if (ctx.channel().isActive()) { - ctx.channel().close(); - } finishTime = System.currentTimeMillis(); if (state != TajoProtos.FetcherState.FETCH_FAILED) { state = TajoProtos.FetcherState.FETCH_FINISHED; } + + IOUtils.cleanup(LOG, fc, raf); } } catch (Exception e) { LOG.error(e.getMessage(), e); @@ -283,7 +268,7 @@ public void channelRead(ChannelHandlerContext ctx, Object msg) public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception { if (cause instanceof ReadTimeoutException) { - LOG.warn(cause, cause); + LOG.warn(cause.getMessage(), cause); } else { LOG.error("Fetch failed :", cause); } From db3bbc9efa13c76cb47bd8067926e2f7e3920a12 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 19 Apr 2015 17:39:49 +0900 Subject: [PATCH 014/141] TAJO-1567: Update old license in some pom.xml files. Signed-off-by: Jinho Kim --- CHANGES | 3 +++ tajo-algebra/pom.xml | 12 +++++----- tajo-plan/pom.xml | 12 +++++----- tajo-rpc/src/main/proto/DummyProtos.proto | 14 +++++++----- tajo-rpc/src/main/proto/RpcProtos.proto | 14 +++++++----- tajo-rpc/src/main/proto/TestProtocol.proto | 14 +++++++----- tajo-rpc/src/main/proto/TestProtos.proto | 14 +++++++----- tajo-storage/pom.xml | 12 +++++----- tajo-storage/tajo-storage-hbase/pom.xml | 26 ++++++++++++---------- tajo-storage/tajo-storage-hdfs/pom.xml | 26 ++++++++++++---------- 10 files changed, 84 insertions(+), 63 deletions(-) diff --git a/CHANGES b/CHANGES index 995305e85b..0c12ffedfc 100644 --- a/CHANGES +++ b/CHANGES @@ -77,6 +77,9 @@ Release 0.10.1 - unreleased TASKS + TAJO-1567: Update old license in some pom.xml files. 
+ (Contributed by Dongjoon Hyun, Committed by jinho) + TAJO-1462: Replace CSV examples into TEXT examples in docs. (Contributed by Dongjoon Hyun, Committed by jihoon) diff --git a/tajo-algebra/pom.xml b/tajo-algebra/pom.xml index ae9d3a74b0..5b19e1ac9b 100644 --- a/tajo-algebra/pom.xml +++ b/tajo-algebra/pom.xml @@ -1,10 +1,12 @@ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + --> Date: Sun, 19 Apr 2015 18:41:41 +0900 Subject: [PATCH 015/141] TAJO-1560: HashShuffle report should be ignored when a succeed tasks are not included. (jinho) --- CHANGES | 3 + .../tajo/master/TajoContainerProxy.java | 9 +- .../org/apache/tajo/querymaster/Stage.java | 133 +++++++++++------- .../tajo/util/history/HistoryWriter.java | 30 ++-- .../tajo/worker/ExecutionBlockContext.java | 47 ++++--- .../tajo/worker/TajoWorkerManagerService.java | 3 +- .../apache/tajo/worker/TaskRunnerManager.java | 16 ++- .../worker/event/TaskRunnerStartEvent.java | 10 +- .../src/main/proto/TajoWorkerProtocol.proto | 1 + .../tajo/querymaster/TestKillQuery.java | 3 +- 10 files changed, 155 insertions(+), 100 deletions(-) diff --git a/CHANGES b/CHANGES index 0c12ffedfc..ad2f0d3452 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1560: HashShuffle report should be ignored when a succeed tasks are not + included. (jinho) + TAJO-1564: TestFetcher fails occasionally. 
(jinho) TAJO-1538: TajoWorkerResourceManager.allocatedResourceMap is increasing diff --git a/tajo-core/src/main/java/org/apache/tajo/master/TajoContainerProxy.java b/tajo-core/src/main/java/org/apache/tajo/master/TajoContainerProxy.java index 7ed9fc5aa0..6128df3670 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/TajoContainerProxy.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/TajoContainerProxy.java @@ -23,9 +23,7 @@ import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; import org.apache.tajo.ExecutionBlockId; import org.apache.tajo.TaskAttemptId; -import org.apache.tajo.conf.TajoConf; import org.apache.tajo.engine.query.QueryContext; -import org.apache.tajo.ha.HAServiceUtil; import org.apache.tajo.ipc.ContainerProtocol; import org.apache.tajo.ipc.QueryCoordinatorProtocol; import org.apache.tajo.ipc.TajoWorkerProtocol; @@ -34,6 +32,7 @@ import org.apache.tajo.master.event.TaskFatalErrorEvent; import org.apache.tajo.master.rm.TajoWorkerContainer; import org.apache.tajo.master.rm.TajoWorkerContainerId; +import org.apache.tajo.plan.serder.PlanProto; import org.apache.tajo.querymaster.QueryMasterTask; import org.apache.tajo.rpc.NettyClientBase; import org.apache.tajo.rpc.NullCallback; @@ -98,13 +97,14 @@ public void killTaskAttempt(TaskAttemptId taskAttemptId) { private void assignExecutionBlock(ExecutionBlockId executionBlockId, TajoContainer container) { NettyClientBase tajoWorkerRpc = null; try { - InetSocketAddress myAddr= context.getQueryMasterContext().getWorkerContext() - .getQueryMasterManagerService().getBindAddr(); InetSocketAddress addr = new InetSocketAddress(container.getNodeId().getHost(), container.getNodeId().getPort()); tajoWorkerRpc = RpcConnectionPool.getPool().getConnection(addr, TajoWorkerProtocol.class, true); TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient = tajoWorkerRpc.getStub(); + PlanProto.ShuffleType shuffleType = + 
context.getQuery().getStage(executionBlockId).getDataChannel().getShuffleType(); + TajoWorkerProtocol.RunExecutionBlockRequestProto request = TajoWorkerProtocol.RunExecutionBlockRequestProto.newBuilder() .setExecutionBlockId(executionBlockId.getProto()) @@ -114,6 +114,7 @@ private void assignExecutionBlock(ExecutionBlockId executionBlockId, TajoContain .setQueryOutputPath(context.getStagingDir().toString()) .setQueryContext(queryContext.getProto()) .setPlanJson(planJson) + .setShuffleType(shuffleType) .build(); tajoWorkerRpcClient.startExecutionBlock(null, request, NullCallback.get()); diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java index 4e1f716fda..c344682fae 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java @@ -239,7 +239,8 @@ StageEventType.SQ_KILL, new KillTasksTransition()) EnumSet.of( StageEventType.SQ_START, StageEventType.SQ_KILL, - StageEventType.SQ_CONTAINER_ALLOCATED)) + StageEventType.SQ_CONTAINER_ALLOCATED, + StageEventType.SQ_SHUFFLE_REPORT)) // Transitions from KILLED state .addTransition(StageState.KILLED, StageState.KILLED, @@ -1299,6 +1300,53 @@ protected void stopFinalization() { stopShuffleReceiver.set(true); } + private void finalizeShuffleReport(StageShuffleReportEvent event, ShuffleType type) { + if(!checkIfNeedFinalizing(type)) return; + + TajoWorkerProtocol.ExecutionBlockReport report = event.getReport(); + + if (!report.getReportSuccess()) { + stopFinalization(); + LOG.error(getId() + ", " + type + " report are failed. 
Caused by:" + report.getReportErrorMessage()); + eventHandler.handle(new StageEvent(getId(), StageEventType.SQ_FAILED)); + } + + completedShuffleTasks.addAndGet(report.getSucceededTasks()); + if (report.getIntermediateEntriesCount() > 0) { + for (IntermediateEntryProto eachInterm : report.getIntermediateEntriesList()) { + hashShuffleIntermediateEntries.add(new IntermediateEntry(eachInterm)); + } + } + + if (completedShuffleTasks.get() >= succeededObjectCount) { + LOG.info(getId() + ", Finalized " + type + " reports: " + completedShuffleTasks.get()); + eventHandler.handle(new StageEvent(getId(), StageEventType.SQ_STAGE_COMPLETED)); + if (timeoutChecker != null) { + stopFinalization(); + synchronized (timeoutChecker){ + timeoutChecker.notifyAll(); + } + } + } else { + LOG.info(getId() + ", Received " + type + " reports " + + completedShuffleTasks.get() + "/" + succeededObjectCount); + } + } + + /** + * HASH_SHUFFLE, SCATTERED_HASH_SHUFFLE should get report from worker nodes when ExecutionBlock is stopping. + * RANGE_SHUFFLE report is sent from task reporter when a task finished in worker node. + */ + public static boolean checkIfNeedFinalizing(ShuffleType type) { + switch (type) { + case HASH_SHUFFLE: + case SCATTERED_HASH_SHUFFLE: + return true; + default: + return false; + } + } + private static class StageFinalizeTransition implements SingleArcTransition { @Override @@ -1309,71 +1357,50 @@ public void transition(final Stage stage, StageEvent event) { } stage.lastContactTime = System.currentTimeMillis(); + ShuffleType shuffleType = stage.getDataChannel().getShuffleType(); try { if (event instanceof StageShuffleReportEvent) { - - StageShuffleReportEvent finalizeEvent = (StageShuffleReportEvent) event; - TajoWorkerProtocol.ExecutionBlockReport report = finalizeEvent.getReport(); - - if (!report.getReportSuccess()) { - stage.stopFinalization(); - LOG.error(stage.getId() + ", Shuffle report are failed. 
Caused by:" + report.getReportErrorMessage()); - stage.eventHandler.handle(new StageEvent(stage.getId(), StageEventType.SQ_FAILED)); - } - - stage.completedShuffleTasks.addAndGet(finalizeEvent.getReport().getSucceededTasks()); - if (report.getIntermediateEntriesCount() > 0) { - for (IntermediateEntryProto eachInterm : report.getIntermediateEntriesList()) { - stage.hashShuffleIntermediateEntries.add(new IntermediateEntry(eachInterm)); - } - } - - if (stage.completedShuffleTasks.get() >= stage.succeededObjectCount) { - LOG.info(stage.getId() + ", Finalized shuffle reports: " + stage.completedShuffleTasks.get()); - stage.eventHandler.handle(new StageEvent(stage.getId(), StageEventType.SQ_STAGE_COMPLETED)); - if (stage.timeoutChecker != null) { - stage.stopFinalization(); - synchronized (stage.timeoutChecker){ - stage.timeoutChecker.notifyAll(); - } - } - } else { - LOG.info(stage.getId() + ", Received shuffle report: " + - stage.completedShuffleTasks.get() + "/" + stage.succeededObjectCount); - } - + stage.finalizeShuffleReport((StageShuffleReportEvent) event, shuffleType); } else { - LOG.info(String.format("Stage finalize - %s (total=%d, success=%d, killed=%d)", + LOG.info(String.format("Stage - %s finalize %s (total=%d, success=%d, killed=%d)", stage.getId().toString(), + shuffleType, stage.totalScheduledObjectsCount, stage.succeededObjectCount, stage.killedObjectCount)); stage.finalizeStage(); - LOG.info(stage.getId() + ", waiting for shuffle reports. expected Tasks:" + stage.succeededObjectCount); + if (checkIfNeedFinalizing(shuffleType)) { + /* wait for StageShuffleReportEvent from worker nodes */ + + LOG.info(stage.getId() + ", wait for " + shuffleType + " reports. 
expected Tasks:" + + stage.succeededObjectCount); /* FIXME implement timeout handler of stage and task */ - if (stage.timeoutChecker != null) { - stage.timeoutChecker = new Thread(new Runnable() { - @Override - public void run() { - while (stage.getSynchronizedState() == StageState.FINALIZING && !Thread.interrupted()) { - long elapsedTime = System.currentTimeMillis() - stage.lastContactTime; - if (elapsedTime > 120 * 1000) { - stage.stopFinalization(); - LOG.error(stage.getId() + ": Timed out while receiving intermediate reports: " + elapsedTime - + " ms, report:" + stage.completedShuffleTasks.get() + "/" + stage.succeededObjectCount); - stage.eventHandler.handle(new StageEvent(stage.getId(), StageEventType.SQ_FAILED)); - } - synchronized (this) { - try { - this.wait(1 * 1000); - } catch (InterruptedException e) { + if (stage.timeoutChecker != null) { + stage.timeoutChecker = new Thread(new Runnable() { + @Override + public void run() { + while (stage.getSynchronizedState() == StageState.FINALIZING && !Thread.interrupted()) { + long elapsedTime = System.currentTimeMillis() - stage.lastContactTime; + if (elapsedTime > 120 * 1000) { + stage.stopFinalization(); + LOG.error(stage.getId() + ": Timed out while receiving intermediate reports: " + elapsedTime + + " ms, report:" + stage.completedShuffleTasks.get() + "/" + stage.succeededObjectCount); + stage.eventHandler.handle(new StageEvent(stage.getId(), StageEventType.SQ_FAILED)); + } + synchronized (this) { + try { + this.wait(1 * 1000); + } catch (InterruptedException e) { + } } } } - } - }); - stage.timeoutChecker.start(); + }); + stage.timeoutChecker.start(); + } + } else { + stage.handle(new StageEvent(stage.getId(), StageEventType.SQ_STAGE_COMPLETED)); } } } catch (Throwable t) { diff --git a/tajo-core/src/main/java/org/apache/tajo/util/history/HistoryWriter.java b/tajo-core/src/main/java/org/apache/tajo/util/history/HistoryWriter.java index f0c6c1134f..e8ba3046b9 100644 --- 
a/tajo-core/src/main/java/org/apache/tajo/util/history/HistoryWriter.java +++ b/tajo-core/src/main/java/org/apache/tajo/util/history/HistoryWriter.java @@ -132,7 +132,7 @@ public WriterFuture appendHistory(History history) { } /* asynchronously flush to history file */ - public synchronized WriterFuture appendAndFlush(History history) { + public WriterFuture appendAndFlush(History history) { WriterFuture future = new WriterFuture(history) { public void done(WriterHolder holder) { try { @@ -163,7 +163,7 @@ public synchronized void appendAndSync(History history) } /* Flushing the buffer */ - public synchronized void flushTaskHistories() { + public void flushTaskHistories() { if (historyQueue.size() > 0) { synchronized (writerThread) { writerThread.needTaskFlush.set(true); @@ -244,20 +244,16 @@ public void run() { cal.add(Calendar.HOUR_OF_DAY, -2); String closeTargetTime = df.format(cal.getTime()); List closingTargets = new ArrayList(); - synchronized (taskWriters) { - for (String eachWriterTime : taskWriters.keySet()) { - if (eachWriterTime.compareTo(closeTargetTime) <= 0) { - closingTargets.add(eachWriterTime); - } + + for (String eachWriterTime : taskWriters.keySet()) { + if (eachWriterTime.compareTo(closeTargetTime) <= 0) { + closingTargets.add(eachWriterTime); } } for (String eachWriterTime : closingTargets) { WriterHolder writerHolder; - synchronized (taskWriters) { - writerHolder = taskWriters.remove(eachWriterTime); - } - + writerHolder = taskWriters.remove(eachWriterTime); if (writerHolder != null) { LOG.info("Closing task history file: " + writerHolder.path); IOUtils.cleanup(LOG, writerHolder); @@ -340,7 +336,7 @@ private List> writeHistory(List//query-detail//query.hist @@ -381,7 +377,7 @@ private synchronized void writeQueryHistory(QueryHistory queryHistory) throws Ex } } - private synchronized WriterHolder writeQuerySummary(QueryInfo queryInfo) throws Exception { + private WriterHolder writeQuerySummary(QueryInfo queryInfo) throws Exception { 
if(stopped.get()) return null; // writing to HDFS and rolling hourly @@ -409,7 +405,7 @@ private synchronized WriterHolder writeQuerySummary(QueryInfo queryInfo) throws return querySummaryWriter; } - private synchronized void rollingQuerySummaryWriter() throws Exception { + private void rollingQuerySummaryWriter() throws Exception { // finding largest file sequence SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss"); String currentDateTime = df.format(new Date(System.currentTimeMillis())); @@ -442,7 +438,7 @@ private void flushTaskHistories() { } } - private synchronized WriterHolder writeTaskHistory(TaskHistory taskHistory) throws Exception { + private WriterHolder writeTaskHistory(TaskHistory taskHistory) throws Exception { SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHH"); String taskStartTime = df.format(new Date(taskHistory.getStartTime())); @@ -536,14 +532,14 @@ static class WriterHolder implements Closeable { FSDataOutputStream out; @Override - public synchronized void close() throws IOException { + public void close() throws IOException { if (out != null) out.close(); } /* * Sync buffered data to DataNodes or disks (flush to disk devices). 
*/ - private synchronized void flush() throws IOException { + private void flush() throws IOException { if (out != null) out.hsync(); } } diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/ExecutionBlockContext.java b/tajo-core/src/main/java/org/apache/tajo/worker/ExecutionBlockContext.java index a645689dcc..c2b63eb9da 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/ExecutionBlockContext.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/ExecutionBlockContext.java @@ -20,6 +20,7 @@ import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import io.netty.channel.ConnectTimeoutException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; @@ -34,18 +35,14 @@ import org.apache.tajo.engine.query.QueryContext; import org.apache.tajo.ipc.QueryMasterProtocol; import org.apache.tajo.master.cluster.WorkerConnectionInfo; +import org.apache.tajo.plan.serder.PlanProto; import org.apache.tajo.rpc.NettyClientBase; import org.apache.tajo.rpc.NullCallback; -import org.apache.tajo.rpc.RpcChannelFactory; import org.apache.tajo.rpc.RpcConnectionPool; import org.apache.tajo.storage.HashShuffleAppenderManager; import org.apache.tajo.storage.StorageUtil; import org.apache.tajo.util.NetUtils; import org.apache.tajo.util.Pair; -import org.apache.tajo.worker.event.TaskRunnerStartEvent; - -import io.netty.channel.ConnectTimeoutException; -import io.netty.channel.EventLoopGroup; import java.io.IOException; import java.net.InetSocketAddress; @@ -68,8 +65,6 @@ public class ExecutionBlockContext { public AtomicInteger killedTasksNum = new AtomicInteger(); public AtomicInteger failedTasksNum = new AtomicInteger(); - private EventLoopGroup loopGroup; - // for temporal or intermediate files private FileSystem localFS; // for input files private FileSystem defaultFS; @@ -92,6 +87,8 @@ public class ExecutionBlockContext { private AtomicBoolean stop = new AtomicBoolean(); + 
private PlanProto.ShuffleType shuffleType; + // It keeps all of the query unit attempts while a TaskRunner is running. private final ConcurrentMap tasks = Maps.newConcurrentMap(); @@ -99,7 +96,8 @@ public class ExecutionBlockContext { public ExecutionBlockContext(TajoConf conf, TajoWorker.WorkerContext workerContext, TaskRunnerManager manager, QueryContext queryContext, String plan, - ExecutionBlockId executionBlockId, WorkerConnectionInfo queryMaster) throws Throwable { + ExecutionBlockId executionBlockId, WorkerConnectionInfo queryMaster, + PlanProto.ShuffleType shuffleType) throws Throwable { this.manager = manager; this.executionBlockId = executionBlockId; this.connPool = RpcConnectionPool.getPool(); @@ -116,6 +114,7 @@ public ExecutionBlockContext(TajoConf conf, TajoWorker.WorkerContext workerConte this.plan = plan; this.resource = new ExecutionBlockSharedResource(); this.workerContext = workerContext; + this.shuffleType = shuffleType; } public void init() throws Throwable { @@ -195,10 +194,6 @@ public FileSystem getLocalFS() { return localFS; } - public FileSystem getDefaultFS() { - return defaultFS; - } - public LocalDirAllocator getLocalDirAllocator() { return workerContext.getLocalDirAllocator(); } @@ -266,11 +261,29 @@ public TajoWorker.WorkerContext getWorkerContext(){ return workerContext; } - private void sendExecutionBlockReport(ExecutionBlockReport reporter) throws Exception { - getQueryMasterStub().doneExecutionBlock(null, reporter, NullCallback.get()); + /** + * HASH_SHUFFLE, SCATTERED_HASH_SHUFFLE should send report when this executionBlock stopping. 
+ */ + protected void sendShuffleReport() throws Exception { + + switch (shuffleType) { + case HASH_SHUFFLE: + case SCATTERED_HASH_SHUFFLE: + sendHashShuffleReport(executionBlockId); + break; + case NONE_SHUFFLE: + case RANGE_SHUFFLE: + default: + break; + } } - protected void reportExecutionBlock(ExecutionBlockId ebId) { + private void sendHashShuffleReport(ExecutionBlockId ebId) throws Exception { + /* This case is that worker did not ran tasks */ + if(completedTasksNum.get() == 0) return; + + QueryMasterProtocol.QueryMasterProtocolService.Interface stub = getQueryMasterStub(); + ExecutionBlockReport.Builder reporterBuilder = ExecutionBlockReport.newBuilder(); reporterBuilder.setEbId(ebId.getProto()); reporterBuilder.setReportSuccess(true); @@ -281,7 +294,7 @@ protected void reportExecutionBlock(ExecutionBlockId ebId) { getWorkerContext().getHashShuffleAppenderManager().close(ebId); if (shuffles == null) { reporterBuilder.addAllIntermediateEntries(intermediateEntries); - sendExecutionBlockReport(reporterBuilder.build()); + stub.doneExecutionBlock(null, reporterBuilder.build(), NullCallback.get()); return; } @@ -334,7 +347,7 @@ protected void reportExecutionBlock(ExecutionBlockId ebId) { } } try { - sendExecutionBlockReport(reporterBuilder.build()); + stub.doneExecutionBlock(null, reporterBuilder.build(), NullCallback.get()); } catch (Throwable e) { // can't send report to query master LOG.fatal(e.getMessage(), e); diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorkerManagerService.java b/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorkerManagerService.java index 4a097725fc..71d96c4825 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorkerManagerService.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorkerManagerService.java @@ -121,7 +121,8 @@ public void startExecutionBlock(RpcController controller, , new ExecutionBlockId(request.getExecutionBlockId()) , request.getContainerId() , new 
QueryContext(workerContext.getConf(), request.getQueryContext()), - request.getPlanJson() + request.getPlanJson(), + request.getShuffleType() )); done.run(TajoWorker.TRUE_PROTO); } catch (Throwable t) { diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/TaskRunnerManager.java b/tajo-core/src/main/java/org/apache/tajo/worker/TaskRunnerManager.java index 3f4a1b8404..955e4f33c0 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/TaskRunnerManager.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/TaskRunnerManager.java @@ -35,7 +35,6 @@ import org.apache.tajo.worker.event.TaskRunnerStartEvent; import org.apache.tajo.worker.event.TaskRunnerStopEvent; -import java.io.IOException; import java.util.*; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicBoolean; @@ -162,8 +161,14 @@ public void handle(TaskRunnerEvent event) { if(context == null){ try { - context = new ExecutionBlockContext(getTajoConf(), getWorkerContext(), this, startEvent.getQueryContext(), - startEvent.getPlan(), startEvent.getExecutionBlockId(), startEvent.getQueryMaster()); + context = new ExecutionBlockContext(getTajoConf(), + getWorkerContext(), + this, + startEvent.getQueryContext(), + startEvent.getPlan(), + startEvent.getExecutionBlockId(), + startEvent.getQueryMaster(), + startEvent.getShuffleType()); context.init(); } catch (Throwable e) { LOG.fatal(e.getMessage(), e); @@ -185,10 +190,9 @@ public void handle(TaskRunnerEvent event) { if(executionBlockContext != null){ try { TupleCache.getInstance().removeBroadcastCache(event.getExecutionBlockId()); - executionBlockContext.reportExecutionBlock(event.getExecutionBlockId()); - workerContext.getHashShuffleAppenderManager().close(event.getExecutionBlockId()); + executionBlockContext.sendShuffleReport(); workerContext.getTaskHistoryWriter().flushTaskHistories(); - } catch (IOException e) { + } catch (Exception e) { LOG.fatal(e.getMessage(), e); throw new RuntimeException(e); } finally { diff 
--git a/tajo-core/src/main/java/org/apache/tajo/worker/event/TaskRunnerStartEvent.java b/tajo-core/src/main/java/org/apache/tajo/worker/event/TaskRunnerStartEvent.java index ff63754ceb..908afa2293 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/event/TaskRunnerStartEvent.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/event/TaskRunnerStartEvent.java @@ -21,6 +21,7 @@ import org.apache.tajo.ExecutionBlockId; import org.apache.tajo.engine.query.QueryContext; import org.apache.tajo.master.cluster.WorkerConnectionInfo; +import org.apache.tajo.plan.serder.PlanProto; public class TaskRunnerStartEvent extends TaskRunnerEvent { @@ -28,17 +29,20 @@ public class TaskRunnerStartEvent extends TaskRunnerEvent { private final WorkerConnectionInfo queryMaster; private final String containerId; private final String plan; + private final PlanProto.ShuffleType shuffleType; public TaskRunnerStartEvent(WorkerConnectionInfo queryMaster, ExecutionBlockId executionBlockId, String containerId, QueryContext context, - String plan) { + String plan, + PlanProto.ShuffleType shuffleType) { super(EventType.START, executionBlockId); this.queryMaster = queryMaster; this.containerId = containerId; this.queryContext = context; this.plan = plan; + this.shuffleType = shuffleType; } public WorkerConnectionInfo getQueryMaster() { @@ -56,4 +60,8 @@ public QueryContext getQueryContext() { public String getPlan() { return plan; } + + public PlanProto.ShuffleType getShuffleType() { + return shuffleType; + } } diff --git a/tajo-core/src/main/proto/TajoWorkerProtocol.proto b/tajo-core/src/main/proto/TajoWorkerProtocol.proto index b8c9575b35..fddef8fa81 100644 --- a/tajo-core/src/main/proto/TajoWorkerProtocol.proto +++ b/tajo-core/src/main/proto/TajoWorkerProtocol.proto @@ -201,6 +201,7 @@ message RunExecutionBlockRequestProto { required KeyValueSetProto queryContext = 6; required string planJson = 7; + required ShuffleType shuffleType = 8; } message ExecutionBlockListProto { diff --git 
a/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java b/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java index 09be700dbc..b2e1ce9587 100644 --- a/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java @@ -211,7 +211,8 @@ public void testKillTask() throws Throwable { taskRequest.setInterQuery(); TaskAttemptId attemptId = new TaskAttemptId(tid, 1); - ExecutionBlockContext context = new ExecutionBlockContext(conf, null, null, new QueryContext(conf), null, eid, null); + ExecutionBlockContext context = + new ExecutionBlockContext(conf, null, null, new QueryContext(conf), null, eid, null, null); org.apache.tajo.worker.Task task = new Task("test", CommonTestingUtil.getTestDir(), attemptId, conf, context, taskRequest); From 4a02456d3e93a3630931066af60d17a61e28638a Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Mon, 20 Apr 2015 10:34:25 +0900 Subject: [PATCH 016/141] TAJO-1522: NPE making stage history before task scheduler is initialized. Signed-off-by: Jinho Kim --- CHANGES | 3 +++ .../src/main/java/org/apache/tajo/querymaster/Stage.java | 8 ++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index ad2f0d3452..fd133d5f86 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1522: NPE making stage history before task scheduler is initialized. + (Contributed by navis, Committed by jinho) + TAJO-1560: HashShuffle report should be ignored when a succeed tasks are not included. 
(jinho) diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java index c344682fae..80ccc213e2 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java @@ -469,8 +469,12 @@ private StageHistory makeStageHistory() { stageHistory.setKilledObjectCount(killedObjectCount); stageHistory.setFailedObjectCount(failedObjectCount); stageHistory.setTotalScheduledObjectsCount(totalScheduledObjectsCount); - stageHistory.setHostLocalAssigned(getTaskScheduler().getHostLocalAssigned()); - stageHistory.setRackLocalAssigned(getTaskScheduler().getRackLocalAssigned()); + + AbstractTaskScheduler scheduler = getTaskScheduler(); + if (scheduler != null) { + stageHistory.setHostLocalAssigned(scheduler.getHostLocalAssigned()); + stageHistory.setRackLocalAssigned(scheduler.getRackLocalAssigned()); + } long totalInputBytes = 0; long totalReadBytes = 0; From 5e1fa93b53cc4b575996a4aceaeb781567dc47d6 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Mon, 20 Apr 2015 11:13:14 +0900 Subject: [PATCH 017/141] TAJO-1568: Apply UnpooledByteBufAllocator when a tajo.test.enabled is set to enable. --- CHANGES | 3 + .../java/org/apache/tajo/util/NumberUtil.java | 26 +++---- .../org/apache/tajo/QueryTestCaseBase.java | 4 +- .../org/apache/tajo/storage/BufferPool.java | 67 ++++++++++++++++--- 4 files changed, 79 insertions(+), 21 deletions(-) diff --git a/CHANGES b/CHANGES index fd133d5f86..806da48b8f 100644 --- a/CHANGES +++ b/CHANGES @@ -83,6 +83,9 @@ Release 0.10.1 - unreleased TASKS + TAJO-1568: Apply UnpooledByteBufAllocator when a tajo.test.enabled + is set to enable. (jinho) + TAJO-1567: Update old license in some pom.xml files. 
(Contributed by Dongjoon Hyun, Committed by jinho) diff --git a/tajo-common/src/main/java/org/apache/tajo/util/NumberUtil.java b/tajo-common/src/main/java/org/apache/tajo/util/NumberUtil.java index 9e16cecd52..0d70cc2034 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/NumberUtil.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/NumberUtil.java @@ -604,14 +604,14 @@ public static double parseDouble(ByteBuf bytes) { * @throws NumberFormatException if the argument could not be parsed as a double */ public static double parseDouble(ByteBuf bytes, int start, int length) { - if (!PlatformDependent.hasUnsafe()) { - return parseDouble(bytes.array(), start, length); - } - if (bytes == null) { throw new NumberFormatException("String is null"); } + if (!bytes.hasMemoryAddress()) { + return parseDouble(bytes.array(), start, length); + } + if (length == 0 || bytes.writerIndex() < start + length) { throw new NumberFormatException("Empty string or Invalid buffer!"); } @@ -815,13 +815,14 @@ public static int parseInt(ByteBuf bytes, int start, int length) { * @throws NumberFormatException if the argument could not be parsed as an int quantity. */ public static int parseInt(ByteBuf bytes, int start, int length, int radix) { - if (!PlatformDependent.hasUnsafe()) { - return parseInt(bytes.array(), start, length); - } - if (bytes == null) { throw new NumberFormatException("String is null"); } + + if (!bytes.hasMemoryAddress()) { + return parseInt(bytes.array(), start, length); + } + if (radix < Character.MIN_RADIX || radix > Character.MAX_RADIX) { throw new NumberFormatException("Invalid radix: " + radix); } @@ -942,13 +943,14 @@ public static long parseLong(ByteBuf bytes, int start, int length) { * @throws NumberFormatException if the argument could not be parsed as an long quantity. 
*/ public static long parseLong(ByteBuf bytes, int start, int length, int radix) { - if (!PlatformDependent.hasUnsafe()) { - return parseInt(bytes.array(), start, length); - } - if (bytes == null) { throw new NumberFormatException("String is null"); } + + if (!bytes.hasMemoryAddress()) { + return parseInt(bytes.array(), start, length); + } + if (radix < Character.MIN_RADIX || radix > Character.MAX_RADIX) { throw new NumberFormatException("Invalid radix: " + radix); } diff --git a/tajo-core/src/test/java/org/apache/tajo/QueryTestCaseBase.java b/tajo-core/src/test/java/org/apache/tajo/QueryTestCaseBase.java index 15fbdaefd7..ddfa7a67fb 100644 --- a/tajo-core/src/test/java/org/apache/tajo/QueryTestCaseBase.java +++ b/tajo-core/src/test/java/org/apache/tajo/QueryTestCaseBase.java @@ -212,7 +212,9 @@ public static void tearDownClass() throws ServiceException { @Before public void printTestName() { /* protect a travis stalled build */ - System.out.println("Run: " + name.getMethodName()); + System.out.println("Run: " + name.getMethodName() + + " Used memory: " + ((Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) + / (1024 * 1024)) + "MBytes"); } public QueryTestCaseBase() { diff --git a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BufferPool.java b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BufferPool.java index 85c79fa17e..d611ee378f 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BufferPool.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BufferPool.java @@ -19,24 +19,75 @@ package org.apache.tajo.storage; import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufAllocator; import io.netty.buffer.PooledByteBufAllocator; +import io.netty.buffer.UnpooledByteBufAllocator; +import io.netty.util.ResourceLeakDetector; import io.netty.util.internal.PlatformDependent; import org.apache.hadoop.classification.InterfaceStability; 
+import org.apache.tajo.conf.TajoConf; +import org.apache.tajo.util.CommonTestingUtil; + +import java.lang.reflect.Field; /* this class is PooledBuffer holder */ public class BufferPool { - private static final PooledByteBufAllocator allocator; + public static final String ALLOW_CACHE = "tajo.storage.buffer.thread-local.cache"; + private static final ByteBufAllocator ALLOCATOR; private BufferPool() { } static { - //TODO we need determine the default params - allocator = new PooledByteBufAllocator(PlatformDependent.directBufferPreferred()); + /* TODO Enable thread cache + * Create a pooled ByteBuf allocator but disables the thread-local cache. + * Because the TaskRunner thread is newly created + * */ + + if (System.getProperty(CommonTestingUtil.TAJO_TEST_KEY, "FALSE").equalsIgnoreCase("TRUE")) { + /* Disable pooling buffers for memory usage */ + ALLOCATOR = UnpooledByteBufAllocator.DEFAULT; + + /* if you are finding memory leak, please enable this line */ + ResourceLeakDetector.setLevel(ResourceLeakDetector.Level.ADVANCED); + } else { + TajoConf tajoConf = new TajoConf(); + ALLOCATOR = createPooledByteBufAllocator(true, tajoConf.getBoolean(ALLOW_CACHE, false), 0); + } + } + + /** + * borrowed from Spark + */ + public static PooledByteBufAllocator createPooledByteBufAllocator( + boolean allowDirectBufs, + boolean allowCache, + int numCores) { + if (numCores == 0) { + numCores = Runtime.getRuntime().availableProcessors(); + } + return new PooledByteBufAllocator( + allowDirectBufs && PlatformDependent.directBufferPreferred(), + Math.min(getPrivateStaticField("DEFAULT_NUM_HEAP_ARENA"), numCores), + Math.min(getPrivateStaticField("DEFAULT_NUM_DIRECT_ARENA"), allowDirectBufs ? numCores : 0), + getPrivateStaticField("DEFAULT_PAGE_SIZE"), + getPrivateStaticField("DEFAULT_MAX_ORDER"), + allowCache ? getPrivateStaticField("DEFAULT_TINY_CACHE_SIZE") : 0, + allowCache ? getPrivateStaticField("DEFAULT_SMALL_CACHE_SIZE") : 0, + allowCache ? 
getPrivateStaticField("DEFAULT_NORMAL_CACHE_SIZE") : 0 + ); + } - /* if you are finding memory leak, please enable this line */ - //ResourceLeakDetector.setLevel(ResourceLeakDetector.Level.ADVANCED); + /** Used to get defaults from Netty's private static fields. */ + private static int getPrivateStaticField(String name) { + try { + Field f = PooledByteBufAllocator.DEFAULT.getClass().getDeclaredField(name); + f.setAccessible(true); + return f.getInt(null); + } catch (Exception e) { + throw new RuntimeException(e); + } } public static long maxDirectMemory() { @@ -44,8 +95,8 @@ public static long maxDirectMemory() { } - public synchronized static ByteBuf directBuffer(int size) { - return allocator.directBuffer(size); + public static ByteBuf directBuffer(int size) { + return ALLOCATOR.directBuffer(size); } /** @@ -55,7 +106,7 @@ public synchronized static ByteBuf directBuffer(int size) { * @return allocated ByteBuf from pool */ public static ByteBuf directBuffer(int size, int max) { - return allocator.directBuffer(size, max); + return ALLOCATOR.directBuffer(size, max); } @InterfaceStability.Unstable From 47008c58ea866a9609f56405b09f968665c66d47 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Mon, 20 Apr 2015 14:12:09 +0900 Subject: [PATCH 018/141] TAJO-1571: Merge TAJO-1497 and TAJO-1569 to 0.10.1. 
(jinho) Closes #544 --- CHANGES | 2 + .../tajo/catalog/AbstractCatalogClient.java | 84 ++++---- .../tajo/client/CatalogAdminClientImpl.java | 40 ++-- .../apache/tajo/client/QueryClientImpl.java | 50 ++--- .../apache/tajo/client/SessionConnection.java | 44 ++-- .../apache/tajo/master/QueryInProgress.java | 6 +- .../tajo/master/TajoContainerProxy.java | 37 ++-- .../apache/tajo/querymaster/QueryMaster.java | 24 +-- .../tajo/worker/ExecutionBlockContext.java | 15 +- .../tajo/worker/TajoResourceAllocator.java | 19 +- .../tajo/worker/WorkerHeartbeatService.java | 16 +- .../ConnectivityCheckerRuleForTajoWorker.java | 26 +-- .../org/apache/tajo/rpc/AsyncRpcClient.java | 128 +++++------- .../org/apache/tajo/rpc/AsyncRpcServer.java | 82 +++----- .../apache/tajo/rpc/BlockingRpcClient.java | 157 ++++++-------- .../apache/tajo/rpc/BlockingRpcServer.java | 85 +++----- .../rpc/ConnectionCloseFutureListener.java | 35 ++++ .../org/apache/tajo/rpc/NettyClientBase.java | 190 +++++++++-------- .../tajo/rpc/ProtoChannelInitializer.java | 11 +- .../org/apache/tajo/rpc/RpcClientManager.java | 185 +++++++++++++++++ .../apache/tajo/rpc/RpcConnectionPool.java | 194 ------------------ .../org/apache/tajo/rpc/ServerCallable.java | 36 +--- .../org/apache/tajo/rpc/TestAsyncRpc.java | 90 ++++++-- .../org/apache/tajo/rpc/TestBlockingRpc.java | 149 ++++++++++---- .../apache/tajo/rpc/TestRpcClientManager.java | 97 +++++++++ 25 files changed, 946 insertions(+), 856 deletions(-) create mode 100644 tajo-rpc/src/main/java/org/apache/tajo/rpc/ConnectionCloseFutureListener.java create mode 100644 tajo-rpc/src/main/java/org/apache/tajo/rpc/RpcClientManager.java delete mode 100644 tajo-rpc/src/main/java/org/apache/tajo/rpc/RpcConnectionPool.java create mode 100644 tajo-rpc/src/test/java/org/apache/tajo/rpc/TestRpcClientManager.java diff --git a/CHANGES b/CHANGES index 806da48b8f..e9b5886a0b 100644 --- a/CHANGES +++ b/CHANGES @@ -83,6 +83,8 @@ Release 0.10.1 - unreleased TASKS + TAJO-1571: Merge TAJO-1497 
and TAJO-1569 to 0.10.1. (jinho) + TAJO-1568: Apply UnpooledByteBufAllocator when a tajo.test.enabled is set to enable. (jinho) diff --git a/tajo-catalog/tajo-catalog-client/src/main/java/org/apache/tajo/catalog/AbstractCatalogClient.java b/tajo-catalog/tajo-catalog-client/src/main/java/org/apache/tajo/catalog/AbstractCatalogClient.java index d8350a38ff..bdb8c2cc6d 100644 --- a/tajo-catalog/tajo-catalog-client/src/main/java/org/apache/tajo/catalog/AbstractCatalogClient.java +++ b/tajo-catalog/tajo-catalog-client/src/main/java/org/apache/tajo/catalog/AbstractCatalogClient.java @@ -30,7 +30,7 @@ import org.apache.tajo.common.TajoDataTypes.DataType; import org.apache.tajo.conf.TajoConf; import org.apache.tajo.rpc.NettyClientBase; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; import org.apache.tajo.rpc.ServerCallable; import org.apache.tajo.rpc.protocolrecords.PrimitiveProtos; import org.apache.tajo.rpc.protocolrecords.PrimitiveProtos.NullProto; @@ -50,14 +50,14 @@ public abstract class AbstractCatalogClient implements CatalogService { private final Log LOG = LogFactory.getLog(AbstractCatalogClient.class); protected ServiceTracker serviceTracker; - protected RpcConnectionPool pool; + protected RpcClientManager manager; protected InetSocketAddress catalogServerAddr; protected TajoConf conf; abstract CatalogProtocolService.BlockingInterface getStub(NettyClientBase client); public AbstractCatalogClient(TajoConf conf, InetSocketAddress catalogServerAddr) { - this.pool = RpcConnectionPool.getPool(); + this.manager = RpcClientManager.getInstance(); this.catalogServerAddr = catalogServerAddr; this.serviceTracker = ServiceTrackerFactory.get(conf); this.conf = conf; @@ -79,7 +79,7 @@ private InetSocketAddress getCatalogServerAddr() { @Override public final Boolean createTablespace(final String tablespaceName, final String tablespaceUri) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) 
{ + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); @@ -98,7 +98,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final Boolean dropTablespace(final String tablespaceName) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.dropTablespace(null, ProtoUtil.convertString(tablespaceName)).getValue(); @@ -113,7 +113,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final Boolean existTablespace(final String tablespaceName) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.existTablespace(null, ProtoUtil.convertString(tablespaceName)).getValue(); @@ -128,7 +128,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final Collection getAllTablespaceNames() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Collection call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); PrimitiveProtos.StringListProto response = stub.getAllTablespaceNames(null, ProtoUtil.NULL_PROTO); @@ -144,7 +144,7 @@ public Collection 
call(NettyClientBase client) throws ServiceException { @Override public List getAllTablespaces() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { @Override public List call(NettyClientBase client) throws Exception { @@ -162,7 +162,7 @@ public List call(NettyClientBase client) throws Exception { @Override public TablespaceProto getTablespace(final String tablespaceName) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public TablespaceProto call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.getTablespace(null, ProtoUtil.convertString(tablespaceName)); @@ -177,7 +177,7 @@ public TablespaceProto call(NettyClientBase client) throws ServiceException { @Override public Boolean alterTablespace(final AlterTablespaceProto alterTablespace) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.alterTablespace(null, alterTablespace).getValue(); @@ -192,7 +192,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final Boolean createDatabase(final String databaseName, @Nullable final String tablespaceName) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface 
stub = getStub(client); @@ -213,7 +213,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final Boolean dropDatabase(final String databaseName) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.dropDatabase(null, ProtoUtil.convertString(databaseName)).getValue(); @@ -228,7 +228,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final Boolean existDatabase(final String databaseName) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.existDatabase(null, ProtoUtil.convertString(databaseName)).getValue(); @@ -243,7 +243,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final Collection getAllDatabaseNames() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Collection call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); PrimitiveProtos.StringListProto response = stub.getAllDatabaseNames(null, ProtoUtil.NULL_PROTO); @@ -259,7 +259,7 @@ public Collection call(NettyClientBase client) throws ServiceException { @Override public List getAllDatabases() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new 
ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { @Override public List call(NettyClientBase client) throws Exception { @@ -277,7 +277,7 @@ public List call(NettyClientBase client) throws Exception { @Override public final TableDesc getTableDesc(final String databaseName, final String tableName) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public TableDesc call(NettyClientBase client) throws ServiceException { TableIdentifierProto.Builder builder = TableIdentifierProto.newBuilder(); builder.setDatabaseName(databaseName); @@ -302,7 +302,7 @@ public TableDesc getTableDesc(String qualifiedName) { @Override public List getAllTables() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { @Override public List call(NettyClientBase client) throws Exception { @@ -320,7 +320,7 @@ public List call(NettyClientBase client) throws Exception @Override public List getAllTableOptions() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { @Override public List call(NettyClientBase client) throws Exception { @@ -338,7 +338,7 @@ public List call(NettyClientBase client) throws Exception { @Override public List getAllTableStats() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { @Override public List call(NettyClientBase client) throws Exception { @@ -356,7 +356,7 @@ public List call(NettyClientBase client) throws Exception { @Override public List getAllColumns() { try { - return new 
ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { @Override public List call(NettyClientBase client) throws Exception { @@ -374,7 +374,7 @@ public List call(NettyClientBase client) throws Exception { @Override public final PartitionMethodDesc getPartitionMethod(final String databaseName, final String tableName) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public PartitionMethodDesc call(NettyClientBase client) throws ServiceException { TableIdentifierProto.Builder builder = TableIdentifierProto.newBuilder(); @@ -382,7 +382,7 @@ public PartitionMethodDesc call(NettyClientBase client) throws ServiceException builder.setTableName(tableName); CatalogProtocolService.BlockingInterface stub = getStub(client); - return CatalogUtil.newPartitionMethodDesc(stub.getPartitionMethodByTableName(null, builder.build())); + return CatalogUtil.newPartitionMethodDesc(stub.getPartitionMethodByTableName(null, builder.build())); } }.withRetries(); } catch (ServiceException e) { @@ -394,7 +394,7 @@ public PartitionMethodDesc call(NettyClientBase client) throws ServiceException @Override public final boolean existPartitionMethod(final String databaseName, final String tableName) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { TableIdentifierProto.Builder builder = TableIdentifierProto.newBuilder(); @@ -414,7 +414,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public List getAllPartitions() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), 
CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { @Override public List call(NettyClientBase client) throws Exception { @@ -432,7 +432,7 @@ public List call(NettyClientBase client) throws Exception { @Override public final Collection getAllTableNames(final String databaseName) { try { - return new ServerCallable>(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Collection call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); PrimitiveProtos.StringListProto response = stub.getAllTableNames(null, ProtoUtil.convertString(databaseName)); @@ -448,7 +448,7 @@ public Collection call(NettyClientBase client) throws ServiceException { @Override public final Collection getFunctions() { try { - return new ServerCallable>(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Collection call(NettyClientBase client) throws ServiceException { List list = new ArrayList(); GetFunctionsResponse response; @@ -475,7 +475,7 @@ public Collection call(NettyClientBase client) throws ServiceExcep @Override public final boolean createTable(final TableDesc desc) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.createTable(null, desc.getProto()).getValue(); @@ -494,7 +494,7 @@ public boolean dropTable(String tableName) { final String simpleName = splitted[1]; try { - return new ServerCallable(this.pool, 
getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { TableIdentifierProto.Builder builder = TableIdentifierProto.newBuilder(); @@ -518,7 +518,7 @@ public final boolean existsTable(final String databaseName, final String tableNa "tableName cannot be composed of multiple parts, but it is \"" + tableName + "\""); } try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { TableIdentifierProto.Builder builder = TableIdentifierProto.newBuilder(); @@ -543,7 +543,7 @@ public final boolean existsTable(final String tableName) { @Override public final boolean createIndex(final IndexDesc index) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.createIndex(null, index.getProto()).getValue(); @@ -558,7 +558,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final boolean existIndexByName(final String databaseName, final String indexName) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { IndexNameProto.Builder builder = IndexNameProto.newBuilder(); builder.setDatabaseName(databaseName); @@ -577,7 +577,7 @@ public Boolean call(NettyClientBase client) 
throws ServiceException { @Override public boolean existIndexByColumn(final String databaseName, final String tableName, final String columnName) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { GetIndexByColumnRequest.Builder builder = GetIndexByColumnRequest.newBuilder(); @@ -597,7 +597,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final IndexDesc getIndexByName(final String databaseName, final String indexName) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public IndexDesc call(NettyClientBase client) throws ServiceException { IndexNameProto.Builder builder = IndexNameProto.newBuilder(); @@ -619,7 +619,7 @@ public final IndexDesc getIndexByColumn(final String databaseName, final String tableName, final String columnName) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public IndexDesc call(NettyClientBase client) throws ServiceException { GetIndexByColumnRequest.Builder builder = GetIndexByColumnRequest.newBuilder(); @@ -640,7 +640,7 @@ public IndexDesc call(NettyClientBase client) throws ServiceException { public boolean dropIndex(final String databaseName, final String indexName) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { IndexNameProto.Builder builder = 
IndexNameProto.newBuilder(); @@ -660,7 +660,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public List getAllIndexes() { try { - return new ServerCallable>(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable>(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { @Override public List call(NettyClientBase client) throws Exception { @@ -678,7 +678,7 @@ public List call(NettyClientBase client) throws Exception { @Override public final boolean createFunction(final FunctionDesc funcDesc) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.createFunction(null, funcDesc.getProto()).getValue(); @@ -693,7 +693,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final boolean dropFunction(final String signature) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { UnregisterFunctionRequest.Builder builder = UnregisterFunctionRequest.newBuilder(); builder.setSignature(signature); @@ -726,7 +726,7 @@ public final FunctionDesc getFunction(final String signature, FunctionType funcT FunctionDescProto descProto = null; try { - descProto = new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + descProto = new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public FunctionDescProto call(NettyClientBase client) throws ServiceException { try { CatalogProtocolService.BlockingInterface 
stub = getStub(client); @@ -776,7 +776,7 @@ public final boolean containFunction(final String signature, FunctionType funcTy } try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.containFunction(null, builder.build()).getValue(); @@ -791,7 +791,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public final boolean alterTable(final AlterTableDesc desc) { try { - return new ServerCallable(this.pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(this.manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.alterTable(null, desc.getProto()).getValue(); @@ -806,7 +806,7 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public boolean updateTableStats(final UpdateTableStatsProto updateTableStatsProto) { try { - return new ServerCallable(pool, getCatalogServerAddr(), CatalogProtocol.class, false) { + return new ServerCallable(manager, getCatalogServerAddr(), CatalogProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CatalogProtocolService.BlockingInterface stub = getStub(client); return stub.updateTableStats(null, updateTableStatsProto).getValue(); diff --git a/tajo-client/src/main/java/org/apache/tajo/client/CatalogAdminClientImpl.java b/tajo-client/src/main/java/org/apache/tajo/client/CatalogAdminClientImpl.java index 6347ad174b..9d0e4271f5 100644 --- a/tajo-client/src/main/java/org/apache/tajo/client/CatalogAdminClientImpl.java +++ 
b/tajo-client/src/main/java/org/apache/tajo/client/CatalogAdminClientImpl.java @@ -48,8 +48,8 @@ public CatalogAdminClientImpl(SessionConnection connection) { @Override public boolean createDatabase(final String databaseName) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { @@ -64,8 +64,8 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public boolean existDatabase(final String databaseName) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { @@ -80,8 +80,8 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public boolean dropDatabase(final String databaseName) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { @@ -96,8 +96,8 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public List getAllDatabaseNames() throws ServiceException { - return new ServerCallable>(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable>(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public 
List call(NettyClientBase client) throws ServiceException { @@ -111,8 +111,8 @@ public List call(NettyClientBase client) throws ServiceException { public boolean existTable(final String tableName) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { connection.checkSessionAndGet(client); @@ -133,8 +133,8 @@ public TableDesc createExternalTable(final String tableName, final Schema schema final TableMeta meta, final PartitionMethodDesc partitionMethodDesc) throws SQLException, ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public TableDesc call(NettyClientBase client) throws ServiceException, SQLException { @@ -169,8 +169,8 @@ public boolean dropTable(String tableName) throws ServiceException { @Override public boolean dropTable(final String tableName, final boolean purge) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { @@ -190,8 +190,8 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public List getTableList(@Nullable final String databaseName) throws ServiceException { - return new ServerCallable>(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new 
ServerCallable>(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public List call(NettyClientBase client) throws ServiceException { @@ -213,8 +213,8 @@ public List call(NettyClientBase client) throws ServiceException { @Override public TableDesc getTableDesc(final String tableName) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public TableDesc call(NettyClientBase client) throws ServiceException, SQLException { @@ -238,8 +238,8 @@ public TableDesc call(NettyClientBase client) throws ServiceException, SQLExcept @Override public List getFunctions(final String functionName) throws ServiceException { - return new ServerCallable>(connection.connPool, - connection.getTajoMasterAddr(), TajoMasterClientProtocol.class, false, true) { + return new ServerCallable>(connection.manager, + connection.getTajoMasterAddr(), TajoMasterClientProtocol.class, false) { public List call(NettyClientBase client) throws ServiceException, SQLException { diff --git a/tajo-client/src/main/java/org/apache/tajo/client/QueryClientImpl.java b/tajo-client/src/main/java/org/apache/tajo/client/QueryClientImpl.java index 4444a31c85..99c58b6b21 100644 --- a/tajo-client/src/main/java/org/apache/tajo/client/QueryClientImpl.java +++ b/tajo-client/src/main/java/org/apache/tajo/client/QueryClientImpl.java @@ -19,7 +19,6 @@ package org.apache.tajo.client; import com.google.protobuf.ServiceException; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tajo.*; @@ -33,7 +32,6 @@ import org.apache.tajo.jdbc.FetchResultSet; import org.apache.tajo.jdbc.TajoMemoryResultSet; import org.apache.tajo.rpc.NettyClientBase; -import org.apache.tajo.rpc.RpcChannelFactory; import 
org.apache.tajo.rpc.ServerCallable; import org.apache.tajo.util.ProtoUtil; @@ -115,8 +113,6 @@ public void closeNonForwardQuery(QueryId queryId) { tajoMaster.closeNonForwardQuery(null, builder.build()); } catch (Exception e) { LOG.warn("Fail to close a TajoMaster connection (qid=" + queryId + ", msg=" + e.getMessage() + ")", e); - } finally { - connection.connPool.closeConnection(tmClient); } } @@ -158,8 +154,8 @@ public Map getAllSessionVariables() throws ServiceException { @Override public ClientProtos.SubmitQueryResponse executeQuery(final String sql) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public ClientProtos.SubmitQueryResponse call(NettyClientBase client) throws ServiceException { @@ -184,8 +180,8 @@ public ClientProtos.SubmitQueryResponse call(NettyClientBase client) throws Serv @Override public ClientProtos.SubmitQueryResponse executeQueryWithJson(final String json) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public ClientProtos.SubmitQueryResponse call(NettyClientBase client) throws ServiceException { @@ -321,8 +317,6 @@ public QueryStatus getQueryStatus(QueryId queryId) throws ServiceException { } catch (Exception e) { throw new ServiceException(e.getMessage(), e); - } finally { - connection.connPool.releaseConnection(tmClient); } return new QueryStatus(res); } @@ -367,8 +361,6 @@ public GetQueryResultResponse getResultResponse(QueryId queryId) throws ServiceE } catch (Exception e) { throw new ServiceException(e.getMessage(), e); - } finally { - 
connection.connPool.releaseConnection(tmClient); } } @@ -378,8 +370,8 @@ public TajoMemoryResultSet fetchNextQueryResult(final QueryId queryId, final int try { final ServerCallable callable = - new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public ClientProtos.SerializedResultSet call(NettyClientBase client) throws ServiceException { @@ -424,8 +416,8 @@ public ClientProtos.SerializedResultSet call(NettyClientBase client) throws Serv @Override public boolean updateQuery(final String sql) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { @@ -454,8 +446,8 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public boolean updateQueryWithJson(final String json) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { @@ -482,8 +474,8 @@ public Boolean call(NettyClientBase client) throws ServiceException { @Override public List getRunningQueryList() throws ServiceException { - return new ServerCallable>(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable>(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public List call(NettyClientBase client) throws 
ServiceException { @@ -502,8 +494,8 @@ public List call(NettyClientBase client) throws Ser @Override public List getFinishedQueryList() throws ServiceException { - return new ServerCallable>(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable>(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public List call(NettyClientBase client) throws ServiceException { @@ -522,8 +514,8 @@ public List call(NettyClientBase client) throws Ser @Override public List getClusterInfo() throws ServiceException { - return new ServerCallable>(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable>(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public List call(NettyClientBase client) throws ServiceException { @@ -574,8 +566,6 @@ public QueryStatus killQuery(final QueryId queryId) } catch(Exception e) { LOG.debug("Error when checking for application status", e); - } finally { - connection.connPool.releaseConnection(tmClient); } return status; } @@ -591,8 +581,8 @@ public int getMaxRows() { } public QueryInfoProto getQueryInfo(final QueryId queryId) throws ServiceException { - return new ServerCallable(connection.connPool, connection.getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(connection.manager, connection.getTajoMasterAddr(), + TajoMasterClientProtocol.class, false) { public QueryInfoProto call(NettyClientBase client) throws ServiceException { connection.checkSessionAndGet(client); @@ -621,8 +611,8 @@ public QueryHistoryProto getQueryHistory(final QueryId queryId) throws ServiceEx InetSocketAddress qmAddress = new InetSocketAddress( queryInfo.getHostNameOfQM(), queryInfo.getQueryMasterClientPort()); - return new ServerCallable(connection.connPool, qmAddress, - QueryMasterClientProtocol.class, 
false, true) { + return new ServerCallable(connection.manager, qmAddress, + QueryMasterClientProtocol.class, false) { public QueryHistoryProto call(NettyClientBase client) throws ServiceException { connection.checkSessionAndGet(client); diff --git a/tajo-client/src/main/java/org/apache/tajo/client/SessionConnection.java b/tajo-client/src/main/java/org/apache/tajo/client/SessionConnection.java index d05d3b1e33..b0cc662225 100644 --- a/tajo-client/src/main/java/org/apache/tajo/client/SessionConnection.java +++ b/tajo-client/src/main/java/org/apache/tajo/client/SessionConnection.java @@ -30,7 +30,7 @@ import org.apache.tajo.ipc.ClientProtos.SessionUpdateResponse; import org.apache.tajo.ipc.TajoMasterClientProtocol; import org.apache.tajo.rpc.NettyClientBase; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; import org.apache.tajo.rpc.ServerCallable; import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.util.KeyValueSet; @@ -55,7 +55,7 @@ public class SessionConnection implements Closeable { private final Log LOG = LogFactory.getLog(TajoClientImpl.class); - final RpcConnectionPool connPool; + final RpcClientManager manager; private final String baseDatabase; @@ -86,8 +86,8 @@ public SessionConnection(ServiceTracker tracker, @Nullable String baseDatabase, this.properties = properties; - connPool = RpcConnectionPool.getPool(); - userInfo = UserRoleInfo.getCurrentUser(); + this.manager = RpcClientManager.getInstance(); + this.userInfo = UserRoleInfo.getCurrentUser(); this.baseDatabase = baseDatabase != null ? 
baseDatabase : null; this.serviceTracker = tracker; @@ -99,12 +99,12 @@ public Map getClientSideSessionVars() { public NettyClientBase getTajoMasterConnection(boolean asyncMode) throws NoSuchMethodException, ConnectTimeoutException, ClassNotFoundException { - return connPool.getConnection(getTajoMasterAddr(), TajoMasterClientProtocol.class, asyncMode); + return manager.getClient(getTajoMasterAddr(), TajoMasterClientProtocol.class, asyncMode); } public NettyClientBase getConnection(InetSocketAddress addr, Class protocolClass, boolean asyncMode) throws NoSuchMethodException, ConnectTimeoutException, ClassNotFoundException { - return connPool.getConnection(addr, protocolClass, asyncMode); + return manager.getClient(addr, protocolClass, asyncMode); } protected KeyValueSet getProperties() { @@ -127,8 +127,8 @@ public String getBaseDatabase() { public boolean isConnected() { if(!closed.get()){ try { - return connPool.getConnection(serviceTracker.getClientServiceAddress(), - TajoMasterClientProtocol.class, false).isActive(); + return manager.getClient(serviceTracker.getClientServiceAddress(), + TajoMasterClientProtocol.class, false).isConnected(); } catch (Throwable e) { return false; } @@ -141,7 +141,7 @@ public UserRoleInfo getUserInfo() { } public String getCurrentDatabase() throws ServiceException { - return new ServerCallable(connPool, getTajoMasterAddr(), TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(manager, getTajoMasterAddr(), TajoMasterClientProtocol.class, false) { public String call(NettyClientBase client) throws ServiceException { checkSessionAndGet(client); @@ -153,8 +153,8 @@ public String call(NettyClientBase client) throws ServiceException { } public Map updateSessionVariables(final Map variables) throws ServiceException { - return new ServerCallable>(connPool, getTajoMasterAddr(), - TajoMasterClientProtocol.class, false, true) { + return new ServerCallable>(manager, getTajoMasterAddr(), + TajoMasterClientProtocol.class, 
false) { public Map call(NettyClientBase client) throws ServiceException { checkSessionAndGet(client); @@ -179,7 +179,7 @@ public Map call(NettyClientBase client) throws ServiceException } public Map unsetSessionVariables(final List variables) throws ServiceException { - return new ServerCallable>(connPool, getTajoMasterAddr(), TajoMasterClientProtocol.class, false, true) { + return new ServerCallable>(manager, getTajoMasterAddr(), TajoMasterClientProtocol.class, false) { public Map call(NettyClientBase client) throws ServiceException { checkSessionAndGet(client); @@ -209,7 +209,7 @@ void updateSessionVarsCache(Map variables) { } public String getSessionVariable(final String varname) throws ServiceException { - return new ServerCallable(connPool, getTajoMasterAddr(), TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(manager, getTajoMasterAddr(), TajoMasterClientProtocol.class, false) { public String call(NettyClientBase client) throws ServiceException { @@ -229,7 +229,7 @@ public String call(NettyClientBase client) throws ServiceException { } public Boolean existSessionVariable(final String varname) throws ServiceException { - return new ServerCallable(connPool, getTajoMasterAddr(), TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(manager, getTajoMasterAddr(), TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { checkSessionAndGet(client); @@ -247,8 +247,8 @@ public Map getCachedAllSessionVariables() { } public Map getAllSessionVariables() throws ServiceException { - return new ServerCallable>(connPool, getTajoMasterAddr(), TajoMasterClientProtocol.class, - false, true) { + return new ServerCallable>(manager, getTajoMasterAddr(), TajoMasterClientProtocol.class, + false) { public Map call(NettyClientBase client) throws ServiceException { checkSessionAndGet(client); @@ -260,7 +260,7 @@ public Map call(NettyClientBase client) throws ServiceException } public 
Boolean selectDatabase(final String databaseName) throws ServiceException { - return new ServerCallable(connPool, getTajoMasterAddr(), TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(manager, getTajoMasterAddr(), TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { checkSessionAndGet(client); @@ -278,13 +278,15 @@ public void close() { } // remove session + NettyClientBase client = null; try { - - NettyClientBase client = connPool.getConnection(getTajoMasterAddr(), TajoMasterClientProtocol.class, false); + client = manager.getClient(getTajoMasterAddr(), TajoMasterClientProtocol.class, false); TajoMasterClientProtocolService.BlockingInterface tajoMaster = client.getStub(); tajoMaster.removeSession(null, sessionId); - } catch (Throwable e) { + // ignore + } finally { + RpcClientManager.cleanup(client); } } @@ -321,7 +323,7 @@ protected void checkSessionAndGet(NettyClientBase client) throws ServiceExceptio } public boolean reconnect() throws Exception { - return new ServerCallable(connPool, getTajoMasterAddr(), TajoMasterClientProtocol.class, false, true) { + return new ServerCallable(manager, getTajoMasterAddr(), TajoMasterClientProtocol.class, false) { public Boolean call(NettyClientBase client) throws ServiceException { CreateSessionRequest.Builder builder = CreateSessionRequest.newBuilder(); diff --git a/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java b/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java index 668a770a5a..d2286cfeed 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/QueryInProgress.java @@ -33,7 +33,7 @@ import org.apache.tajo.plan.logical.LogicalRootNode; import org.apache.tajo.rpc.NettyClientBase; import org.apache.tajo.rpc.NullCallback; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; import 
org.apache.tajo.rpc.protocolrecords.PrimitiveProtos; import org.apache.tajo.session.Session; import org.apache.tajo.util.NetUtils; @@ -112,7 +112,7 @@ public void stopProgress() { masterContext.getResourceManager().releaseQueryMaster(queryId); if(queryMasterRpc != null) { - RpcConnectionPool.getPool().closeConnection(queryMasterRpc); + RpcClientManager.cleanup(queryMasterRpc); } try { @@ -157,7 +157,7 @@ private void connectQueryMaster() throws Exception { InetSocketAddress addr = NetUtils.createSocketAddr(queryInfo.getQueryMasterHost(), queryInfo.getQueryMasterPort()); LOG.info("Connect to QueryMaster:" + addr); queryMasterRpc = - RpcConnectionPool.getPool().getConnection(addr, QueryMasterProtocol.class, true); + RpcClientManager.getInstance().getClient(addr, QueryMasterProtocol.class, true); queryMasterRpcClient = queryMasterRpc.getStub(); } diff --git a/tajo-core/src/main/java/org/apache/tajo/master/TajoContainerProxy.java b/tajo-core/src/main/java/org/apache/tajo/master/TajoContainerProxy.java index 6128df3670..9d54ce1f67 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/TajoContainerProxy.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/TajoContainerProxy.java @@ -36,7 +36,7 @@ import org.apache.tajo.querymaster.QueryMasterTask; import org.apache.tajo.rpc.NettyClientBase; import org.apache.tajo.rpc.NullCallback; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.worker.TajoWorker; @@ -83,14 +83,12 @@ public void killTaskAttempt(TaskAttemptId taskAttemptId) { NettyClientBase tajoWorkerRpc = null; try { InetSocketAddress addr = new InetSocketAddress(container.getNodeId().getHost(), container.getNodeId().getPort()); - tajoWorkerRpc = RpcConnectionPool.getPool().getConnection(addr, TajoWorkerProtocol.class, true); + tajoWorkerRpc = RpcClientManager.getInstance().getClient(addr, TajoWorkerProtocol.class, true); 
TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient = tajoWorkerRpc.getStub(); tajoWorkerRpcClient.killTaskAttempt(null, taskAttemptId.getProto(), NullCallback.get()); } catch (Throwable e) { /* Worker RPC failure */ context.getEventHandler().handle(new TaskFatalErrorEvent(taskAttemptId, e.getMessage())); - } finally { - RpcConnectionPool.getPool().releaseConnection(tajoWorkerRpc); } } @@ -99,7 +97,7 @@ private void assignExecutionBlock(ExecutionBlockId executionBlockId, TajoContain try { InetSocketAddress addr = new InetSocketAddress(container.getNodeId().getHost(), container.getNodeId().getPort()); - tajoWorkerRpc = RpcConnectionPool.getPool().getConnection(addr, TajoWorkerProtocol.class, true); + tajoWorkerRpc = RpcClientManager.getInstance().getClient(addr, TajoWorkerProtocol.class, true); TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient = tajoWorkerRpc.getStub(); PlanProto.ShuffleType shuffleType = @@ -120,8 +118,6 @@ private void assignExecutionBlock(ExecutionBlockId executionBlockId, TajoContain tajoWorkerRpcClient.startExecutionBlock(null, request, NullCallback.get()); } catch (Throwable e) { LOG.error(e.getMessage(), e); - } finally { - RpcConnectionPool.getPool().releaseConnection(tajoWorkerRpc); } } @@ -173,23 +169,18 @@ public static void releaseWorkerResource(QueryMasterTask.QueryMasterTaskContext containerIdProtos.add(TajoWorkerContainerId.getContainerIdProto(eachContainerId)); } - RpcConnectionPool connPool = RpcConnectionPool.getPool(); + RpcClientManager manager = RpcClientManager.getInstance(); NettyClientBase tmClient = null; - try { - ServiceTracker serviceTracker = context.getQueryMasterContext().getWorkerContext().getServiceTracker(); - tmClient = connPool.getConnection(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); - QueryCoordinatorProtocol.QueryCoordinatorProtocolService masterClientService = tmClient.getStub(); - masterClientService.releaseWorkerResource(null, - 
QueryCoordinatorProtocol.WorkerResourceReleaseRequest.newBuilder() - .setExecutionBlockId(executionBlockId.getProto()) - .addAllContainerIds(containerIdProtos) - .build(), - NullCallback.get()); - } catch (Throwable e) { - LOG.error(e.getMessage(), e); - } finally { - connPool.releaseConnection(tmClient); - } + ServiceTracker serviceTracker = context.getQueryMasterContext().getWorkerContext().getServiceTracker(); + tmClient = manager.getClient(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); + + QueryCoordinatorProtocol.QueryCoordinatorProtocolService masterClientService = tmClient.getStub(); + masterClientService.releaseWorkerResource(null, + QueryCoordinatorProtocol.WorkerResourceReleaseRequest.newBuilder() + .setExecutionBlockId(executionBlockId.getProto()) + .addAllContainerIds(containerIdProtos) + .build(), + NullCallback.get()); } } diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/QueryMaster.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/QueryMaster.java index bf23133ba4..2b229559d1 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/QueryMaster.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/QueryMaster.java @@ -41,7 +41,7 @@ import org.apache.tajo.rpc.CallFuture; import org.apache.tajo.rpc.NettyClientBase; import org.apache.tajo.rpc.NullCallback; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; import org.apache.tajo.rpc.protocolrecords.PrimitiveProtos; import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.util.NetUtils; @@ -88,7 +88,7 @@ public class QueryMaster extends CompositeService implements EventHandler { private TajoWorker.WorkerContext workerContext; - private RpcConnectionPool connPool; + private RpcClientManager manager; private ExecutorService eventExecutor; @@ -104,7 +104,7 @@ public void init(Configuration conf) { } try { this.systemConf = (TajoConf)conf; - this.connPool = 
RpcConnectionPool.getPool(); + this.manager = RpcClientManager.getInstance(); querySessionTimeout = systemConf.getIntVar(TajoConf.ConfVars.QUERY_SESSION_TIMEOUT); queryMasterContext = new QueryMasterContext(systemConf); @@ -190,7 +190,7 @@ protected void cleanupExecutionBlock(List ex for (WorkerResourceProto worker : workers) { try { TajoProtos.WorkerConnectionInfoProto connectionInfo = worker.getConnectionInfo(); - rpc = connPool.getConnection(NetUtils.createSocketAddr(connectionInfo.getHost(), connectionInfo.getPeerRpcPort()), + rpc = manager.getClient(NetUtils.createSocketAddr(connectionInfo.getHost(), connectionInfo.getPeerRpcPort()), TajoWorkerProtocol.class, true); TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerProtocolService = rpc.getStub(); @@ -200,8 +200,6 @@ protected void cleanupExecutionBlock(List ex continue; } catch (Exception e) { continue; - } finally { - connPool.releaseConnection(rpc); } } } @@ -214,15 +212,13 @@ private void cleanup(QueryId queryId) { for (WorkerResourceProto worker : workers) { try { TajoProtos.WorkerConnectionInfoProto connectionInfo = worker.getConnectionInfo(); - rpc = connPool.getConnection(NetUtils.createSocketAddr(connectionInfo.getHost(), connectionInfo.getPeerRpcPort()), + rpc = manager.getClient(NetUtils.createSocketAddr(connectionInfo.getHost(), connectionInfo.getPeerRpcPort()), TajoWorkerProtocol.class, true); TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerProtocolService = rpc.getStub(); tajoWorkerProtocolService.cleanup(null, queryId.getProto(), NullCallback.get()); } catch (Exception e) { LOG.error(e.getMessage(), e); - } finally { - connPool.releaseConnection(rpc); } } } @@ -237,7 +233,7 @@ public List getAllWorker() { // update master address in worker context. 
ServiceTracker serviceTracker = workerContext.getServiceTracker(); - rpc = connPool.getConnection(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); + rpc = manager.getClient(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); QueryCoordinatorProtocolService masterService = rpc.getStub(); CallFuture callBack = new CallFuture(); @@ -248,8 +244,6 @@ public List getAllWorker() { return workerResourcesRequest.getWorkerResourcesList(); } catch (Exception e) { LOG.error(e.getMessage(), e); - } finally { - connPool.releaseConnection(rpc); } return new ArrayList(); } @@ -345,7 +339,7 @@ public void stopQuery(QueryId queryId) { NettyClientBase tmClient = null; try { - tmClient = connPool.getConnection(workerContext.getServiceTracker().getUmbilicalAddress(), + tmClient = manager.getClient(workerContext.getServiceTracker().getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); QueryCoordinatorProtocolService masterClientService = tmClient.getStub(); @@ -355,8 +349,6 @@ public void stopQuery(QueryId queryId) { //When tajo do stop cluster, tajo master maybe throw closed connection exception LOG.error(e.getMessage(), e); - } finally { - connPool.releaseConnection(tmClient); } try { @@ -451,7 +443,7 @@ public void run() { try { ServiceTracker serviceTracker = queryMasterContext.getWorkerContext().getServiceTracker(); - tmClient = connPool.getConnection(serviceTracker.getUmbilicalAddress(), + tmClient = manager.getClient(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); QueryCoordinatorProtocolService masterClientService = tmClient.getStub(); diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/ExecutionBlockContext.java b/tajo-core/src/main/java/org/apache/tajo/worker/ExecutionBlockContext.java index c2b63eb9da..9c3fbe4d85 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/ExecutionBlockContext.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/ExecutionBlockContext.java @@ 
-38,7 +38,7 @@ import org.apache.tajo.plan.serder.PlanProto; import org.apache.tajo.rpc.NettyClientBase; import org.apache.tajo.rpc.NullCallback; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; import org.apache.tajo.storage.HashShuffleAppenderManager; import org.apache.tajo.storage.StorageUtil; import org.apache.tajo.util.NetUtils; @@ -76,7 +76,7 @@ public class ExecutionBlockContext { private ExecutionBlockSharedResource resource; private TajoQueryEngine queryEngine; - private RpcConnectionPool connPool; + private RpcClientManager connManager; private InetSocketAddress qmMasterAddr; private WorkerConnectionInfo queryMaster; private TajoConf systemConf; @@ -100,7 +100,7 @@ public ExecutionBlockContext(TajoConf conf, TajoWorker.WorkerContext workerConte PlanProto.ShuffleType shuffleType) throws Throwable { this.manager = manager; this.executionBlockId = executionBlockId; - this.connPool = RpcConnectionPool.getPool(); + this.connManager = RpcClientManager.getInstance(); this.queryMaster = queryMaster; this.systemConf = conf; this.reporter = new Reporter(); @@ -149,13 +149,8 @@ public ExecutionBlockSharedResource getSharedResource() { public QueryMasterProtocol.QueryMasterProtocolService.Interface getQueryMasterStub() throws NoSuchMethodException, ConnectTimeoutException, ClassNotFoundException { - NettyClientBase clientBase = null; - try { - clientBase = connPool.getConnection(qmMasterAddr, QueryMasterProtocol.class, true); - return clientBase.getStub(); - } finally { - connPool.releaseConnection(clientBase); - } + NettyClientBase clientBase = connManager.getClient(qmMasterAddr, QueryMasterProtocol.class, true); + return clientBase.getStub(); } public void stop(){ diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/TajoResourceAllocator.java b/tajo-core/src/main/java/org/apache/tajo/worker/TajoResourceAllocator.java index 47a9fda43e..49cb1e9af6 100644 --- 
a/tajo-core/src/main/java/org/apache/tajo/worker/TajoResourceAllocator.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/TajoResourceAllocator.java @@ -50,16 +50,14 @@ import org.apache.tajo.rpc.CallFuture; import org.apache.tajo.rpc.NettyClientBase; import org.apache.tajo.rpc.NullCallback; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; +import org.apache.tajo.rpc.protocolrecords.PrimitiveProtos; import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.util.ApplicationIdUtils; import java.net.InetSocketAddress; import java.util.*; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; +import java.util.concurrent.*; import java.util.concurrent.atomic.AtomicBoolean; public class TajoResourceAllocator extends AbstractResourceAllocator { @@ -193,14 +191,12 @@ private void stopExecutionBlock(ExecutionBlockId executionBlockId, NodeId worker NettyClientBase tajoWorkerRpc = null; try { InetSocketAddress addr = new InetSocketAddress(worker.getHost(), worker.getPort()); - tajoWorkerRpc = RpcConnectionPool.getPool().getConnection(addr, TajoWorkerProtocol.class, true); + tajoWorkerRpc = RpcClientManager.getInstance().getClient(addr, TajoWorkerProtocol.class, true); TajoWorkerProtocol.TajoWorkerProtocolService tajoWorkerRpcClient = tajoWorkerRpc.getStub(); tajoWorkerRpcClient.stopExecutionBlock(null, executionBlockId.getProto(), NullCallback.get()); } catch (Throwable e) { LOG.error(e.getMessage(), e); - } finally { - RpcConnectionPool.getPool().releaseConnection(tajoWorkerRpc); } } @@ -278,17 +274,16 @@ public void run() { .setQueryId(event.getExecutionBlockId().getQueryId().getProto()) .build(); - RpcConnectionPool connPool = RpcConnectionPool.getPool(); + NettyClientBase tmClient = null; try { ServiceTracker serviceTracker = 
queryTaskContext.getQueryMasterContext().getWorkerContext().getServiceTracker(); - tmClient = connPool.getConnection(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); + tmClient = RpcClientManager.getInstance(). + getClient(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); QueryCoordinatorProtocolService masterClientService = tmClient.getStub(); masterClientService.allocateWorkerResources(null, request, callBack); } catch (Throwable e) { LOG.error(e.getMessage(), e); - } finally { - connPool.releaseConnection(tmClient); } WorkerResourceAllocationResponse response = null; diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/WorkerHeartbeatService.java b/tajo-core/src/main/java/org/apache/tajo/worker/WorkerHeartbeatService.java index 5493b37cbb..ad67f94adb 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/WorkerHeartbeatService.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/WorkerHeartbeatService.java @@ -18,7 +18,6 @@ package org.apache.tajo.worker; -import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.protobuf.ServiceException; import org.apache.commons.logging.Log; @@ -26,14 +25,13 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.service.AbstractService; import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.ha.HAServiceUtil; import org.apache.tajo.ipc.QueryCoordinatorProtocol.ClusterResourceSummary; import org.apache.tajo.ipc.QueryCoordinatorProtocol.ServerStatusProto; import org.apache.tajo.ipc.QueryCoordinatorProtocol.TajoHeartbeatResponse; import org.apache.tajo.ipc.TajoResourceTrackerProtocol; import org.apache.tajo.rpc.CallFuture; import org.apache.tajo.rpc.NettyClientBase; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; import org.apache.tajo.rpc.protocolrecords.PrimitiveProtos; import org.apache.tajo.service.ServiceTracker; import 
org.apache.tajo.storage.DiskDeviceInfo; @@ -57,7 +55,7 @@ public class WorkerHeartbeatService extends AbstractService { private final TajoWorker.WorkerContext context; private TajoConf systemConf; - private RpcConnectionPool connectionPool; + private RpcClientManager connectionManager; private WorkerHeartbeatThread thread; private static final float HDFS_DATANODE_STORAGE_SIZE; @@ -72,10 +70,12 @@ public WorkerHeartbeatService(TajoWorker.WorkerContext context) { @Override public void serviceInit(Configuration conf) throws Exception { - Preconditions.checkArgument(conf instanceof TajoConf, "Configuration must be a TajoConf instance."); + if (!(conf instanceof TajoConf)) { + throw new IllegalArgumentException("Configuration must be a TajoConf instance"); + } this.systemConf = (TajoConf) conf; - connectionPool = RpcConnectionPool.getPool(); + this.connectionManager = RpcClientManager.getInstance(); super.serviceInit(conf); } @@ -184,7 +184,7 @@ public void run() { CallFuture callBack = new CallFuture(); ServiceTracker serviceTracker = context.getServiceTracker(); - rmClient = connectionPool.getConnection(serviceTracker.getResourceTrackerAddress(), + rmClient = connectionManager.getClient(serviceTracker.getResourceTrackerAddress(), TajoResourceTrackerProtocol.class, true); TajoResourceTrackerProtocol.TajoResourceTrackerProtocolService resourceTracker = rmClient.getStub(); resourceTracker.heartbeat(callBack.getController(), heartbeatProto, callBack); @@ -207,8 +207,6 @@ public void run() { LOG.warn("Heartbeat response is being delayed.", te); } catch (Exception e) { LOG.error(e.getMessage(), e); - } finally { - connectionPool.releaseConnection(rmClient); } try { diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/rule/ConnectivityCheckerRuleForTajoWorker.java b/tajo-core/src/main/java/org/apache/tajo/worker/rule/ConnectivityCheckerRuleForTajoWorker.java index 4b76c732fc..f94bd78d23 100644 --- 
a/tajo-core/src/main/java/org/apache/tajo/worker/rule/ConnectivityCheckerRuleForTajoWorker.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/rule/ConnectivityCheckerRuleForTajoWorker.java @@ -19,20 +19,15 @@ package org.apache.tajo.worker.rule; import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.conf.TajoConf.ConfVars; -import org.apache.tajo.ha.HAServiceUtil; import org.apache.tajo.ipc.QueryCoordinatorProtocol; import org.apache.tajo.rpc.NettyClientBase; -import org.apache.tajo.rpc.RpcConnectionPool; +import org.apache.tajo.rpc.RpcClientManager; import org.apache.tajo.rule.*; import org.apache.tajo.rule.EvaluationResult.EvaluationResultCode; import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.service.ServiceTrackerFactory; -import org.apache.tajo.util.NetUtils; import org.apache.tajo.worker.TajoWorker; -import java.net.InetSocketAddress; - /** * With this rule, Tajo worker will check the connectivity to tajo master server. */ @@ -42,20 +37,11 @@ public class ConnectivityCheckerRuleForTajoWorker implements SelfDiagnosisRule { private void checkTajoMasterConnectivity(TajoConf tajoConf) throws Exception { - RpcConnectionPool pool = RpcConnectionPool.getPool(); - NettyClientBase masterClient = null; - InetSocketAddress masterAddress = null; - - try { - ServiceTracker serviceTracker = ServiceTrackerFactory.get(tajoConf); - masterClient = pool.getConnection(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); - masterClient.getStub(); - } finally { - if (masterClient != null) { - pool.releaseConnection(masterClient); - } - } - + RpcClientManager manager = RpcClientManager.getInstance(); + + ServiceTracker serviceTracker = ServiceTrackerFactory.get(tajoConf); + NettyClientBase masterClient = manager.getClient(serviceTracker.getUmbilicalAddress(), QueryCoordinatorProtocol.class, true); + masterClient.getStub(); } @Override diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/AsyncRpcClient.java 
b/tajo-rpc/src/main/java/org/apache/tajo/rpc/AsyncRpcClient.java index 5845229a70..e6dbf2c7e9 100644 --- a/tajo-rpc/src/main/java/org/apache/tajo/rpc/AsyncRpcClient.java +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/AsyncRpcClient.java @@ -20,76 +20,53 @@ import com.google.protobuf.Descriptors.MethodDescriptor; import com.google.protobuf.*; - import io.netty.channel.*; +import io.netty.handler.timeout.IdleState; +import io.netty.handler.timeout.IdleStateEvent; +import io.netty.util.concurrent.GenericFutureListener; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.tajo.rpc.RpcClientManager.RpcConnectionKey; import org.apache.tajo.rpc.RpcProtos.RpcRequest; import org.apache.tajo.rpc.RpcProtos.RpcResponse; -import io.netty.util.ReferenceCountUtil; -import io.netty.util.concurrent.GenericFutureListener; - import java.lang.reflect.Method; import java.net.InetSocketAddress; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.atomic.AtomicInteger; - -import static org.apache.tajo.rpc.RpcConnectionPool.RpcConnectionKey; +import java.util.concurrent.ConcurrentMap; public class AsyncRpcClient extends NettyClientBase { private static final Log LOG = LogFactory.getLog(AsyncRpcClient.class); - private final ChannelInitializer initializer; - private final ProxyRpcChannel rpcChannel; - - private final AtomicInteger sequence = new AtomicInteger(0); - private final Map requests = + private final ConcurrentMap requests = new ConcurrentHashMap(); - private final Class protocol; private final Method stubMethod; - - private RpcConnectionKey key; + private final ProxyRpcChannel rpcChannel; + private final ClientChannelInboundHandler inboundHandler; /** * Intentionally make this method package-private, avoiding user directly * new an instance through this constructor. 
*/ - AsyncRpcClient(final Class protocol, - final InetSocketAddress addr, int retries) - throws ClassNotFoundException, NoSuchMethodException, ConnectTimeoutException { - - this.protocol = protocol; - String serviceClassName = protocol.getName() + "$" - + protocol.getSimpleName() + "Service"; - Class serviceClass = Class.forName(serviceClassName); - stubMethod = serviceClass.getMethod("newStub", RpcChannel.class); - - initializer = new ProtoChannelInitializer(new ClientChannelInboundHandler(), - RpcResponse.getDefaultInstance()); - super.init(addr, initializer, retries); - rpcChannel = new ProxyRpcChannel(); - this.key = new RpcConnectionKey(addr, protocol, true); + AsyncRpcClient(RpcConnectionKey rpcConnectionKey, int retries) + throws ClassNotFoundException, NoSuchMethodException { + this(rpcConnectionKey, retries, 0); } - @Override - public RpcConnectionKey getKey() { - return key; + AsyncRpcClient(RpcConnectionKey rpcConnectionKey, int retries, int idleTimeSeconds) + throws ClassNotFoundException, NoSuchMethodException { + super(rpcConnectionKey, retries); + stubMethod = getServiceClass().getMethod("newStub", RpcChannel.class); + rpcChannel = new ProxyRpcChannel(); + inboundHandler = new ClientChannelInboundHandler(); + init(new ProtoChannelInitializer(inboundHandler, RpcResponse.getDefaultInstance(), idleTimeSeconds)); } @Override public T getStub() { - try { - return (T) stubMethod.invoke(null, rpcChannel); - } catch (Exception e) { - throw new RemoteException(e.getMessage(), e); - } - } - - public RpcChannel getRpcChannel() { - return this.rpcChannel; + return getStub(stubMethod, rpcChannel); } protected void sendExceptions(String message) { @@ -113,17 +90,6 @@ public void close() { } private class ProxyRpcChannel implements RpcChannel { - private final ClientChannelInboundHandler handler; - - public ProxyRpcChannel() { - this.handler = getChannel().pipeline() - .get(ClientChannelInboundHandler.class); - - if (handler == null) { - throw new 
IllegalArgumentException("Channel does not have " + - "proper handler"); - } - } public void callMethod(final MethodDescriptor method, final RpcController controller, @@ -135,7 +101,7 @@ public void callMethod(final MethodDescriptor method, Message rpcRequest = buildRequest(nextSeqId, method, param); - handler.registerCallback(nextSeqId, + inboundHandler.registerCallback(nextSeqId, new ResponseCallback(controller, responseType, done)); ChannelPromise channelPromise = getChannel().newPromise(); @@ -144,7 +110,7 @@ public void callMethod(final MethodDescriptor method, @Override public void operationComplete(ChannelFuture future) throws Exception { if (!future.isSuccess()) { - handler.exceptionCaught(null, new ServiceException(future.cause())); + inboundHandler.exceptionCaught(null, new ServiceException(future.cause())); } } }); @@ -160,7 +126,7 @@ private Message buildRequest(int seqId, .setMethodName(method.getName()); if (param != null) { - requestBuilder.setRequestMessage(param.toByteString()); + requestBuilder.setRequestMessage(param.toByteString()); } return requestBuilder.build(); @@ -215,52 +181,56 @@ private String getErrorMessage(String message) { } @ChannelHandler.Sharable - private class ClientChannelInboundHandler extends ChannelInboundHandlerAdapter { + private class ClientChannelInboundHandler extends SimpleChannelInboundHandler { - synchronized void registerCallback(int seqId, ResponseCallback callback) { + void registerCallback(int seqId, ResponseCallback callback) { - if (requests.containsKey(seqId)) { + if (requests.putIfAbsent(seqId, callback) != null) { throw new RemoteException( getErrorMessage("Duplicate Sequence Id "+ seqId)); } - - requests.put(seqId, callback); } @Override - public void channelRead(ChannelHandlerContext ctx, Object msg) - throws Exception { - if (msg instanceof RpcResponse) { - try { - RpcResponse response = (RpcResponse) msg; - ResponseCallback callback = requests.remove(response.getId()); + protected void 
channelRead0(ChannelHandlerContext ctx, RpcResponse response) throws Exception { + ResponseCallback callback = requests.remove(response.getId()); - if (callback == null) { - LOG.warn("Dangling rpc call"); - } else { - callback.run(response); - } - } finally { - ReferenceCountUtil.release(msg); - } + if (callback == null) { + LOG.warn("Dangling rpc call"); + } else { + callback.run(response); } } + @Override + public void channelActive(ChannelHandlerContext ctx) throws Exception { + super.channelActive(ctx); + LOG.info("Connection established successfully : " + ctx.channel().remoteAddress()); + } + @Override public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception { LOG.error(getRemoteAddress() + "," + protocol + "," + cause.getMessage(), cause); sendExceptions(cause.getMessage()); - + if(LOG.isDebugEnabled()) { LOG.error(cause.getMessage(), cause); } else { LOG.error("RPC Exception:" + cause.getMessage()); } - - if (ctx != null && ctx.channel().isActive()) { - ctx.channel().close(); + } + + @Override + public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exception { + if (evt instanceof IdleStateEvent) { + IdleStateEvent e = (IdleStateEvent) evt; + /* If all requests is done and event is triggered, channel will be closed. 
*/ + if (e.state() == IdleState.ALL_IDLE && requests.size() == 0) { + ctx.close(); + LOG.warn("Idle connection closed successfully :" + ctx.channel().remoteAddress()); + } } } } diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/AsyncRpcServer.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/AsyncRpcServer.java index 3b5a747686..e4109fef92 100644 --- a/tajo-rpc/src/main/java/org/apache/tajo/rpc/AsyncRpcServer.java +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/AsyncRpcServer.java @@ -18,17 +18,17 @@ package org.apache.tajo.rpc; -import com.google.protobuf.*; import com.google.protobuf.Descriptors.MethodDescriptor; - +import com.google.protobuf.Message; +import com.google.protobuf.RpcCallback; +import com.google.protobuf.RpcController; +import com.google.protobuf.Service; import io.netty.channel.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tajo.rpc.RpcProtos.RpcRequest; import org.apache.tajo.rpc.RpcProtos.RpcResponse; -import io.netty.util.ReferenceCountUtil; - import java.lang.reflect.Method; import java.net.InetSocketAddress; @@ -57,7 +57,7 @@ public AsyncRpcServer(final Class protocol, } @ChannelHandler.Sharable - private class ServerHandler extends ChannelInboundHandlerAdapter { + private class ServerHandler extends SimpleChannelInboundHandler { @Override public void channelRegistered(ChannelHandlerContext ctx) throws Exception { @@ -78,55 +78,46 @@ public void channelUnregistered(ChannelHandlerContext ctx) throws Exception { } @Override - public void channelRead(final ChannelHandlerContext ctx, Object msg) - throws Exception { - if (msg instanceof RpcRequest) { - try { - final RpcRequest request = (RpcRequest) msg; - - String methodName = request.getMethodName(); - MethodDescriptor methodDescriptor = service.getDescriptorForType().findMethodByName(methodName); + protected void channelRead0(final ChannelHandlerContext ctx, final RpcRequest request) throws Exception { - if (methodDescriptor == 
null) { - throw new RemoteCallException(request.getId(), new NoSuchMethodException(methodName)); - } - - Message paramProto = null; - if (request.hasRequestMessage()) { - try { - paramProto = service.getRequestPrototype(methodDescriptor).newBuilderForType() - .mergeFrom(request.getRequestMessage()).build(); - } catch (Throwable t) { - throw new RemoteCallException(request.getId(), methodDescriptor, t); - } - } + String methodName = request.getMethodName(); + MethodDescriptor methodDescriptor = service.getDescriptorForType().findMethodByName(methodName); - final RpcController controller = new NettyRpcController(); + if (methodDescriptor == null) { + throw new RemoteCallException(request.getId(), new NoSuchMethodException(methodName)); + } - RpcCallback callback = !request.hasId() ? null : new RpcCallback() { + Message paramProto = null; + if (request.hasRequestMessage()) { + try { + paramProto = service.getRequestPrototype(methodDescriptor).newBuilderForType() + .mergeFrom(request.getRequestMessage()).build(); + } catch (Throwable t) { + throw new RemoteCallException(request.getId(), methodDescriptor, t); + } + } - public void run(Message returnValue) { + final RpcController controller = new NettyRpcController(); - RpcResponse.Builder builder = RpcResponse.newBuilder().setId(request.getId()); + RpcCallback callback = !request.hasId() ? 
null : new RpcCallback() { - if (returnValue != null) { - builder.setResponseMessage(returnValue.toByteString()); - } + public void run(Message returnValue) { - if (controller.failed()) { - builder.setErrorMessage(controller.errorText()); - } + RpcResponse.Builder builder = RpcResponse.newBuilder().setId(request.getId()); - ctx.writeAndFlush(builder.build()); - } - }; + if (returnValue != null) { + builder.setResponseMessage(returnValue.toByteString()); + } - service.callMethod(methodDescriptor, controller, paramProto, callback); + if (controller.failed()) { + builder.setErrorMessage(controller.errorText()); + } - } finally { - ReferenceCountUtil.release(msg); + ctx.writeAndFlush(builder.build()); } - } + }; + + service.callMethod(methodDescriptor, controller, paramProto, callback); } @Override @@ -138,11 +129,6 @@ public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) } else { LOG.error(cause.getMessage()); } - - if (ctx != null && ctx.channel().isActive()) { - ctx.channel().close(); - } } - } } \ No newline at end of file diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/BlockingRpcClient.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/BlockingRpcClient.java index 4ec57188e5..61e2f045f3 100644 --- a/tajo-rpc/src/main/java/org/apache/tajo/rpc/BlockingRpcClient.java +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/BlockingRpcClient.java @@ -18,80 +18,57 @@ package org.apache.tajo.rpc; -import com.google.protobuf.*; +import com.google.protobuf.BlockingRpcChannel; import com.google.protobuf.Descriptors.MethodDescriptor; - +import com.google.protobuf.Message; +import com.google.protobuf.RpcController; +import com.google.protobuf.ServiceException; import io.netty.channel.*; -import io.netty.util.concurrent.*; +import io.netty.handler.timeout.IdleState; +import io.netty.handler.timeout.IdleStateEvent; +import io.netty.util.concurrent.GenericFutureListener; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; 
+import org.apache.tajo.rpc.RpcClientManager.RpcConnectionKey; import org.apache.tajo.rpc.RpcProtos.RpcRequest; import org.apache.tajo.rpc.RpcProtos.RpcResponse; -import io.netty.util.ReferenceCountUtil; - import java.lang.reflect.Method; import java.net.InetSocketAddress; import java.util.Map; import java.util.concurrent.*; -import java.util.concurrent.Future; -import java.util.concurrent.atomic.AtomicInteger; - -import static org.apache.tajo.rpc.RpcConnectionPool.RpcConnectionKey; public class BlockingRpcClient extends NettyClientBase { private static final Log LOG = LogFactory.getLog(RpcProtos.class); - private final ChannelInitializer initializer; - private final ProxyRpcChannel rpcChannel; - - private final AtomicInteger sequence = new AtomicInteger(0); private final Map requests = new ConcurrentHashMap(); - private final Class protocol; private final Method stubMethod; - - private RpcConnectionKey key; + private final ProxyRpcChannel rpcChannel; + private final ChannelInboundHandlerAdapter inboundHandler; /** * Intentionally make this method package-private, avoiding user directly * new an instance through this constructor. 
*/ - BlockingRpcClient(final Class protocol, - final InetSocketAddress addr, int retries) - throws ClassNotFoundException, NoSuchMethodException, ConnectTimeoutException { - - this.protocol = protocol; - String serviceClassName = protocol.getName() + "$" - + protocol.getSimpleName() + "Service"; - Class serviceClass = Class.forName(serviceClassName); - stubMethod = serviceClass.getMethod("newBlockingStub", - BlockingRpcChannel.class); - - initializer = new ProtoChannelInitializer(new ClientChannelInboundHandler(), RpcResponse.getDefaultInstance()); - super.init(addr, initializer, retries); - rpcChannel = new ProxyRpcChannel(); - - this.key = new RpcConnectionKey(addr, protocol, false); + BlockingRpcClient(RpcConnectionKey rpcConnectionKey, int retries) + throws NoSuchMethodException, ClassNotFoundException { + this(rpcConnectionKey, retries, 0); } - @Override - public RpcConnectionKey getKey() { - return key; + BlockingRpcClient(RpcConnectionKey rpcConnectionKey, int retries, int idleTimeSeconds) + throws ClassNotFoundException, NoSuchMethodException { + super(rpcConnectionKey, retries); + stubMethod = getServiceClass().getMethod("newBlockingStub", BlockingRpcChannel.class); + rpcChannel = new ProxyRpcChannel(); + inboundHandler = new ClientChannelInboundHandler(); + init(new ProtoChannelInitializer(inboundHandler, RpcResponse.getDefaultInstance(), idleTimeSeconds)); } @Override public T getStub() { - try { - return (T) stubMethod.invoke(null, rpcChannel); - } catch (Exception e) { - throw new RuntimeException(e.getMessage(), e); - } - } - - public BlockingRpcChannel getBlockingRpcChannel() { - return this.rpcChannel; + return getStub(stubMethod, rpcChannel); } @Override @@ -100,25 +77,12 @@ public void close() { callback.setFailed("BlockingRpcClient terminates all the connections", new ServiceException("BlockingRpcClient terminates all the connections")); } - + requests.clear(); super.close(); } private class ProxyRpcChannel implements BlockingRpcChannel { - 
private final ClientChannelInboundHandler handler; - - public ProxyRpcChannel() { - - this.handler = getChannel().pipeline(). - get(ClientChannelInboundHandler.class); - - if (handler == null) { - throw new IllegalArgumentException("Channel does not have " + - "proper handler"); - } - } - @Override public Message callBlockingMethod(final MethodDescriptor method, final RpcController controller, @@ -139,7 +103,7 @@ public Message callBlockingMethod(final MethodDescriptor method, @Override public void operationComplete(ChannelFuture future) throws Exception { if (!future.isSuccess()) { - handler.exceptionCaught(null, new ServiceException(future.cause())); + inboundHandler.exceptionCaught(null, new ServiceException(future.cause())); } } }); @@ -174,7 +138,7 @@ private Message buildRequest(int seqId, } private String getErrorMessage(String message) { - if(protocol != null && getChannel() != null) { + if(getChannel() != null) { return protocol.getName() + "(" + RpcUtils.normalizeInetSocketAddress((InetSocketAddress) getChannel().remoteAddress()) + "): " + message; @@ -184,7 +148,7 @@ private String getErrorMessage(String message) { } private TajoServiceException makeTajoServiceException(RpcResponse response, Throwable cause) { - if(protocol != null && getChannel() != null) { + if(getChannel() != null) { return new TajoServiceException(response.getErrorMessage(), cause, protocol.getName(), RpcUtils.normalizeInetSocketAddress((InetSocketAddress)getChannel().remoteAddress())); } else { @@ -193,39 +157,29 @@ private TajoServiceException makeTajoServiceException(RpcResponse response, Thro } @ChannelHandler.Sharable - private class ClientChannelInboundHandler extends ChannelInboundHandlerAdapter { + private class ClientChannelInboundHandler extends SimpleChannelInboundHandler { @Override - public void channelRead(ChannelHandlerContext ctx, Object msg) - throws Exception { - - if (msg instanceof RpcResponse) { - try { - RpcResponse rpcResponse = (RpcResponse) msg; - 
ProtoCallFuture callback = requests.remove(rpcResponse.getId()); + protected void channelRead0(ChannelHandlerContext ctx, RpcResponse rpcResponse) throws Exception { + ProtoCallFuture callback = requests.remove(rpcResponse.getId()); - if (callback == null) { - LOG.warn("Dangling rpc call"); + if (callback == null) { + LOG.warn("Dangling rpc call"); + } else { + if (rpcResponse.hasErrorMessage()) { + callback.setFailed(rpcResponse.getErrorMessage(), + makeTajoServiceException(rpcResponse, new ServiceException(rpcResponse.getErrorTrace()))); + } else { + Message responseMessage; + + if (!rpcResponse.hasResponseMessage()) { + responseMessage = null; } else { - if (rpcResponse.hasErrorMessage()) { - callback.setFailed(rpcResponse.getErrorMessage(), - makeTajoServiceException(rpcResponse, new ServiceException(rpcResponse.getErrorTrace()))); - throw new RemoteException(getErrorMessage(rpcResponse.getErrorMessage())); - } else { - Message responseMessage; - - if (!rpcResponse.hasResponseMessage()) { - responseMessage = null; - } else { - responseMessage = callback.returnType.newBuilderForType().mergeFrom(rpcResponse.getResponseMessage()) - .build(); - } - - callback.setResponse(responseMessage); - } + responseMessage = callback.returnType.newBuilderForType().mergeFrom(rpcResponse.getResponseMessage()) + .build(); } - } finally { - ReferenceCountUtil.release(msg); + + callback.setResponse(responseMessage); } } } @@ -233,22 +187,39 @@ public void channelRead(ChannelHandlerContext ctx, Object msg) @Override public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) throws Exception { + /* Current requests will be failed */ for(ProtoCallFuture callback: requests.values()) { callback.setFailed(cause.getMessage(), cause); } - + requests.clear(); + if(LOG.isDebugEnabled()) { LOG.error("" + cause.getMessage(), cause); } else { LOG.error("RPC Exception:" + cause.getMessage()); } - if (ctx != null && ctx.channel().isActive()) { - ctx.channel().close(); + } + + 
@Override + public void channelActive(ChannelHandlerContext ctx) throws Exception { + super.channelActive(ctx); + LOG.info("Connection established successfully : " + ctx.channel().remoteAddress()); + } + + @Override + public void userEventTriggered(ChannelHandlerContext ctx, Object evt) throws Exception { + if (evt instanceof IdleStateEvent) { + IdleStateEvent e = (IdleStateEvent) evt; + /* If all requests is done and event is triggered, channel will be closed. */ + if (e.state() == IdleState.ALL_IDLE && requests.size() == 0) { + ctx.close(); + LOG.warn("Idle connection closed successfully :" + ctx.channel().remoteAddress()); + } } } } - static class ProtoCallFuture implements Future { + static class ProtoCallFuture implements Future { private Semaphore sem = new Semaphore(0); private Message response = null; private Message returnType; diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/BlockingRpcServer.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/BlockingRpcServer.java index 0ce359fb98..bb3136732f 100644 --- a/tajo-rpc/src/main/java/org/apache/tajo/rpc/BlockingRpcServer.java +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/BlockingRpcServer.java @@ -22,15 +22,12 @@ import com.google.protobuf.Descriptors.MethodDescriptor; import com.google.protobuf.Message; import com.google.protobuf.RpcController; - import io.netty.channel.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tajo.rpc.RpcProtos.RpcRequest; import org.apache.tajo.rpc.RpcProtos.RpcResponse; -import io.netty.util.ReferenceCountUtil; - import java.lang.reflect.Method; import java.net.InetSocketAddress; @@ -62,7 +59,7 @@ public BlockingRpcServer(final Class protocol, } @ChannelHandler.Sharable - private class ServerHandler extends ChannelInboundHandlerAdapter { + private class ServerHandler extends SimpleChannelInboundHandler { @Override public void channelRegistered(ChannelHandlerContext ctx) throws Exception { @@ -83,52 +80,43 @@ public 
void channelUnregistered(ChannelHandlerContext ctx) throws Exception { } @Override - public void channelRead(ChannelHandlerContext ctx, Object msg) - throws Exception { + protected void channelRead0(ChannelHandlerContext ctx, RpcRequest request) throws Exception { - if (msg instanceof RpcRequest) { + String methodName = request.getMethodName(); + MethodDescriptor methodDescriptor = service.getDescriptorForType().findMethodByName(methodName); + + if (methodDescriptor == null) { + throw new RemoteCallException(request.getId(), new NoSuchMethodException(methodName)); + } + Message paramProto = null; + if (request.hasRequestMessage()) { try { - final RpcRequest request = (RpcRequest) msg; - - String methodName = request.getMethodName(); - MethodDescriptor methodDescriptor = service.getDescriptorForType().findMethodByName(methodName); - - if (methodDescriptor == null) { - throw new RemoteCallException(request.getId(), new NoSuchMethodException(methodName)); - } - Message paramProto = null; - if (request.hasRequestMessage()) { - try { - paramProto = service.getRequestPrototype(methodDescriptor).newBuilderForType() - .mergeFrom(request.getRequestMessage()).build(); - - } catch (Throwable t) { - throw new RemoteCallException(request.getId(), methodDescriptor, t); - } - } - Message returnValue; - RpcController controller = new NettyRpcController(); - - try { - returnValue = service.callBlockingMethod(methodDescriptor, controller, paramProto); - } catch (Throwable t) { - throw new RemoteCallException(request.getId(), methodDescriptor, t); - } - - RpcResponse.Builder builder = RpcResponse.newBuilder().setId(request.getId()); - - if (returnValue != null) { - builder.setResponseMessage(returnValue.toByteString()); - } - - if (controller.failed()) { - builder.setErrorMessage(controller.errorText()); - } - ctx.writeAndFlush(builder.build()); - } finally { - ReferenceCountUtil.release(msg); + paramProto = service.getRequestPrototype(methodDescriptor).newBuilderForType() + 
.mergeFrom(request.getRequestMessage()).build(); + + } catch (Throwable t) { + throw new RemoteCallException(request.getId(), methodDescriptor, t); } } + Message returnValue; + RpcController controller = new NettyRpcController(); + + try { + returnValue = service.callBlockingMethod(methodDescriptor, controller, paramProto); + } catch (Throwable t) { + throw new RemoteCallException(request.getId(), methodDescriptor, t); + } + + RpcResponse.Builder builder = RpcResponse.newBuilder().setId(request.getId()); + + if (returnValue != null) { + builder.setResponseMessage(returnValue.toByteString()); + } + + if (controller.failed()) { + builder.setErrorMessage(controller.errorText()); + } + ctx.writeAndFlush(builder.build()); } @Override @@ -137,11 +125,6 @@ public void exceptionCaught(ChannelHandlerContext ctx, Throwable cause) { RemoteCallException callException = (RemoteCallException) cause; ctx.writeAndFlush(callException.getResponse()); } - - if (ctx != null && ctx.channel().isActive()) { - ctx.channel().close(); - } } - } } \ No newline at end of file diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/ConnectionCloseFutureListener.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/ConnectionCloseFutureListener.java new file mode 100644 index 0000000000..29c977265c --- /dev/null +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/ConnectionCloseFutureListener.java @@ -0,0 +1,35 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.rpc; + +import io.netty.util.concurrent.Future; +import io.netty.util.concurrent.GenericFutureListener; + +public class ConnectionCloseFutureListener implements GenericFutureListener { + private RpcClientManager.RpcConnectionKey key; + + public ConnectionCloseFutureListener(RpcClientManager.RpcConnectionKey key) { + this.key = key; + } + + @Override + public void operationComplete(Future future) throws Exception { + RpcClientManager.remove(key); + } +} diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/NettyClientBase.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/NettyClientBase.java index 7b521781db..a75148be48 100644 --- a/tajo-rpc/src/main/java/org/apache/tajo/rpc/NettyClientBase.java +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/NettyClientBase.java @@ -18,156 +18,150 @@ package org.apache.tajo.rpc; -import io.netty.channel.*; - -import org.apache.commons.lang.exception.ExceptionUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - import io.netty.bootstrap.Bootstrap; import io.netty.buffer.PooledByteBufAllocator; +import io.netty.channel.*; import io.netty.channel.socket.nio.NioSocketChannel; -import io.netty.util.concurrent.GenericFutureListener; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.tajo.rpc.RpcClientManager.RpcConnectionKey; import java.io.Closeable; +import java.lang.reflect.Method; import java.net.InetSocketAddress; -import java.util.concurrent.CountDownLatch; -import 
java.util.concurrent.TimeUnit; +import java.net.SocketAddress; import java.util.concurrent.atomic.AtomicInteger; public abstract class NettyClientBase implements Closeable { - private static Log LOG = LogFactory.getLog(NettyClientBase.class); - private static final int CLIENT_CONNECTION_TIMEOUT_SEC = 60; + private static final Log LOG = LogFactory.getLog(NettyClientBase.class); + private static final int CONNECTION_TIMEOUT = 60000; // 60 sec private static final long PAUSE = 1000; // 1 sec - private int numRetries; - protected Bootstrap bootstrap; - private ChannelFuture channelFuture; + private final int numRetries; - public NettyClientBase() { - } + private Bootstrap bootstrap; + private volatile ChannelFuture channelFuture; - public abstract T getStub(); - public abstract RpcConnectionPool.RpcConnectionKey getKey(); - - public void init(InetSocketAddress addr, ChannelInitializer initializer, - int numRetries) throws ConnectTimeoutException { + protected final Class protocol; + protected final AtomicInteger sequence = new AtomicInteger(0); + + private final RpcConnectionKey key; + + public NettyClientBase(RpcConnectionKey rpcConnectionKey, int numRetries) + throws ClassNotFoundException, NoSuchMethodException { + this.key = rpcConnectionKey; + this.protocol = rpcConnectionKey.protocolClass; this.numRetries = numRetries; - - init(addr, initializer); } - public void init(InetSocketAddress addr, ChannelInitializer initializer) - throws ConnectTimeoutException { + // should be called from sub class + protected void init(ChannelInitializer initializer) { this.bootstrap = new Bootstrap(); this.bootstrap - .channel(NioSocketChannel.class) - .handler(initializer) - .option(ChannelOption.ALLOCATOR, PooledByteBufAllocator.DEFAULT) - .option(ChannelOption.SO_REUSEADDR, true) - .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, 10000) - .option(ChannelOption.SO_RCVBUF, 1048576 * 10) - .option(ChannelOption.TCP_NODELAY, true); - - connect(addr); + 
.group(RpcChannelFactory.getSharedClientEventloopGroup()) + .channel(NioSocketChannel.class) + .handler(initializer) + .option(ChannelOption.ALLOCATOR, PooledByteBufAllocator.DEFAULT) + .option(ChannelOption.SO_REUSEADDR, true) + .option(ChannelOption.CONNECT_TIMEOUT_MILLIS, CONNECTION_TIMEOUT) + .option(ChannelOption.SO_RCVBUF, 1048576 * 10) + .option(ChannelOption.TCP_NODELAY, true); } - private void connectUsingNetty(InetSocketAddress address, GenericFutureListener listener) { + public RpcClientManager.RpcConnectionKey getKey() { + return key; + } - this.channelFuture = bootstrap.clone().group(RpcChannelFactory.getSharedClientEventloopGroup()) - .connect(address) - .addListener(listener); + protected final Class getServiceClass() throws ClassNotFoundException { + String serviceClassName = protocol.getName() + "$" + protocol.getSimpleName() + "Service"; + return Class.forName(serviceClassName); } - - private void handleConnectionInternally(final InetSocketAddress addr) throws ConnectTimeoutException { - final CountDownLatch latch = new CountDownLatch(1); - GenericFutureListener listener = new RetryConnectionListener(addr, latch); - connectUsingNetty(addr, listener); + @SuppressWarnings("unchecked") + protected final T getStub(Method stubMethod, Object rpcChannel) { try { - latch.await(CLIENT_CONNECTION_TIMEOUT_SEC, TimeUnit.SECONDS); - } catch (InterruptedException e) { - } - - if (!channelFuture.isSuccess()) { - throw new ConnectTimeoutException("Connect error to " + addr + - " caused by " + ExceptionUtils.getMessage(channelFuture.cause())); + return (T) stubMethod.invoke(null, rpcChannel); + } catch (Exception e) { + throw new RemoteException(e.getMessage(), e); } } - public void connect(InetSocketAddress addr) throws ConnectTimeoutException { - if(addr.isUnresolved()){ - addr = RpcUtils.createSocketAddr(addr.getHostName(), addr.getPort()); + public abstract T getStub(); + + + private InetSocketAddress resolveAddress(InetSocketAddress address) { + if 
(address.isUnresolved()) { + return RpcUtils.createSocketAddr(address.getHostName(), address.getPort()); } + return address; + } - handleConnectionInternally(addr); + private ChannelFuture doConnect(SocketAddress address) { + return this.channelFuture = bootstrap.clone().connect(address); } - class RetryConnectionListener implements GenericFutureListener { - private final AtomicInteger retryCount = new AtomicInteger(); - private final InetSocketAddress address; - private final CountDownLatch latch; - RetryConnectionListener(InetSocketAddress address, CountDownLatch latch) { - this.address = address; - this.latch = latch; + public synchronized void connect() throws ConnectTimeoutException { + if (isConnected()) return; + + final AtomicInteger retries = new AtomicInteger(); + InetSocketAddress address = key.addr; + if (address.isUnresolved()) { + address = resolveAddress(address); } - @Override - public void operationComplete(ChannelFuture channelFuture) throws Exception { - if (!channelFuture.isSuccess()) { - channelFuture.channel().close(); + /* do not call await() inside handler */ + ChannelFuture f = doConnect(address).awaitUninterruptibly(); + retries.incrementAndGet(); + + if (!f.isSuccess() && numRetries > 0) { + doReconnect(address, f, retries); + } + } - if (numRetries > retryCount.getAndIncrement()) { - final GenericFutureListener currentListener = this; + private void doReconnect(final InetSocketAddress address, ChannelFuture future, AtomicInteger retries) + throws ConnectTimeoutException { - RpcChannelFactory.getSharedClientEventloopGroup().schedule(new Runnable() { - @Override - public void run() { - connectUsingNetty(address, currentListener); - } - }, PAUSE, TimeUnit.MILLISECONDS); + for (; ; ) { + if (numRetries >= retries.getAndIncrement()) { - LOG.debug("Connecting to " + address + " has been failed. 
Retrying to connect."); + LOG.warn(future.cause().getMessage() + " Try to reconnect"); + try { + Thread.sleep(PAUSE); + } catch (InterruptedException e) { } - else { - latch.countDown(); - LOG.error("Max retry count has been exceeded. attempts=" + numRetries); + this.channelFuture = doConnect(address).awaitUninterruptibly(); + if (this.channelFuture.isDone() && this.channelFuture.isSuccess()) { + break; } - } - else { - latch.countDown(); + } else { + throw new ConnectTimeoutException("Max retry count has been exceeded. attempts=" + numRetries + + " caused by: " + future.cause()); } } } - public boolean isActive() { - return getChannel().isActive(); + public Channel getChannel() { + return channelFuture == null ? null : channelFuture.channel(); } - public InetSocketAddress getRemoteAddress() { - if (channelFuture == null || channelFuture.channel() == null) { - return null; - } - return (InetSocketAddress) channelFuture.channel().remoteAddress(); + public boolean isConnected() { + Channel channel = getChannel(); + return channel != null && channel.isOpen() && channel.isActive(); } - public Channel getChannel() { - return channelFuture.channel(); + public SocketAddress getRemoteAddress() { + Channel channel = getChannel(); + return channel == null ? 
null : channel.remoteAddress(); } @Override public void close() { - if (channelFuture != null && getChannel().isActive()) { - getChannel().close(); - } - - if (this.bootstrap != null) { - InetSocketAddress address = getRemoteAddress(); - if (address != null) { - LOG.debug("Proxy is disconnected from " + address.getHostName() + ":" + address.getPort()); - } + Channel channel = getChannel(); + if (channel != null && channel.isOpen()) { + LOG.debug("Proxy will be disconnected from remote " + channel.remoteAddress()); + channel.close().awaitUninterruptibly(); } } } diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/ProtoChannelInitializer.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/ProtoChannelInitializer.java index 6a340dcdde..74eb650450 100644 --- a/tajo-rpc/src/main/java/org/apache/tajo/rpc/ProtoChannelInitializer.java +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/ProtoChannelInitializer.java @@ -18,6 +18,7 @@ package org.apache.tajo.rpc; +import com.google.protobuf.MessageLite; import io.netty.channel.Channel; import io.netty.channel.ChannelHandler; import io.netty.channel.ChannelInitializer; @@ -26,16 +27,21 @@ import io.netty.handler.codec.protobuf.ProtobufEncoder; import io.netty.handler.codec.protobuf.ProtobufVarint32FrameDecoder; import io.netty.handler.codec.protobuf.ProtobufVarint32LengthFieldPrepender; - -import com.google.protobuf.MessageLite; +import io.netty.handler.timeout.IdleStateHandler; class ProtoChannelInitializer extends ChannelInitializer { private final MessageLite defaultInstance; private final ChannelHandler handler; + private final int idleTimeSeconds; public ProtoChannelInitializer(ChannelHandler handler, MessageLite defaultInstance) { + this(handler, defaultInstance, 0); + } + + public ProtoChannelInitializer(ChannelHandler handler, MessageLite defaultInstance, int idleTimeSeconds) { this.handler = handler; this.defaultInstance = defaultInstance; + this.idleTimeSeconds = idleTimeSeconds; } @Override @@ -45,6 +51,7 @@ 
protected void initChannel(Channel channel) throws Exception { pipeline.addLast("protobufDecoder", new ProtobufDecoder(defaultInstance)); pipeline.addLast("frameEncoder", new ProtobufVarint32LengthFieldPrepender()); pipeline.addLast("protobufEncoder", new ProtobufEncoder()); + pipeline.addLast("idleStateHandler", new IdleStateHandler(0, 0, idleTimeSeconds)); //zero is disabling pipeline.addLast("handler", handler); } } diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/RpcClientManager.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/RpcClientManager.java new file mode 100644 index 0000000000..f05fb97c23 --- /dev/null +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/RpcClientManager.java @@ -0,0 +1,185 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.rpc; + +import io.netty.channel.ConnectTimeoutException; +import io.netty.util.internal.logging.CommonsLoggerFactory; +import io.netty.util.internal.logging.InternalLoggerFactory; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import javax.annotation.concurrent.ThreadSafe; +import java.net.InetSocketAddress; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +@ThreadSafe +public class RpcClientManager { + private static final Log LOG = LogFactory.getLog(RpcClientManager.class); + + public static final int RPC_RETRIES = 3; + + /* If all requests is done and client is idle state, client will be removed. */ + public static final int RPC_IDLE_TIMEOUT = 43200; // 12 hour + + /* entries will be removed by ConnectionCloseFutureListener */ + private static final Map + clients = Collections.synchronizedMap(new HashMap()); + + private static RpcClientManager instance; + + static { + InternalLoggerFactory.setDefaultFactory(new CommonsLoggerFactory()); + instance = new RpcClientManager(); + } + + private RpcClientManager() { + } + + public static RpcClientManager getInstance() { + return instance; + } + + private NettyClientBase makeClient(RpcConnectionKey rpcConnectionKey) + throws NoSuchMethodException, ClassNotFoundException, ConnectTimeoutException { + NettyClientBase client; + if (rpcConnectionKey.asyncMode) { + client = new AsyncRpcClient(rpcConnectionKey, RPC_RETRIES, RPC_IDLE_TIMEOUT); + } else { + client = new BlockingRpcClient(rpcConnectionKey, RPC_RETRIES, RPC_IDLE_TIMEOUT); + } + return client; + } + + /** + * Connect a {@link NettyClientBase} to the remote {@link NettyServerBase}, and returns rpc client by protocol. + * This client will be shared per protocol and address. 
Client is removed in shared map when a client is closed + * @param addr + * @param protocolClass + * @param asyncMode + * @return + * @throws NoSuchMethodException + * @throws ClassNotFoundException + * @throws ConnectTimeoutException + */ + public NettyClientBase getClient(InetSocketAddress addr, + Class protocolClass, boolean asyncMode) + throws NoSuchMethodException, ClassNotFoundException, ConnectTimeoutException { + RpcConnectionKey key = new RpcConnectionKey(addr, protocolClass, asyncMode); + + NettyClientBase client; + synchronized (clients) { + client = clients.get(key); + if (client == null) { + clients.put(key, client = makeClient(key)); + } + } + + if (!client.isConnected()) { + client.connect(); + client.getChannel().closeFuture().addListener(new ConnectionCloseFutureListener(key)); + } + assert client.isConnected(); + return client; + } + + /** + * Request to close this clients + * After it is closed, it is removed from clients map. + */ + public static void close() { + LOG.info("Closing RPC client manager"); + + for (NettyClientBase eachClient : clients.values()) { + try { + eachClient.close(); + } catch (Exception e) { + LOG.error(e.getMessage(), e); + } + } + } + + /** + * Close client manager and shutdown Netty RPC worker pool + * After it is shutdown it is not possible to reuse it again. + */ + public static void shutdown() { + close(); + RpcChannelFactory.shutdownGracefully(); + } + + protected static NettyClientBase remove(RpcConnectionKey key) { + LOG.debug("Removing shared rpc client :" + key); + return clients.remove(key); + } + + protected static boolean contains(RpcConnectionKey key) { + return clients.containsKey(key); + } + + public static void cleanup(NettyClientBase... 
clients) { + for (NettyClientBase client : clients) { + if (client != null) { + try { + client.close(); + } catch (Exception e) { + if (LOG.isDebugEnabled()) { + LOG.debug("Exception in closing " + client.getKey(), e); + } + } + } + } + } + + static class RpcConnectionKey { + final InetSocketAddress addr; + final Class protocolClass; + final boolean asyncMode; + + final String description; + + public RpcConnectionKey(InetSocketAddress addr, + Class protocolClass, boolean asyncMode) { + this.addr = addr; + this.protocolClass = protocolClass; + this.asyncMode = asyncMode; + this.description = "[" + protocolClass + "] " + addr + "," + asyncMode; + } + + @Override + public String toString() { + return description; + } + + @Override + public boolean equals(Object obj) { + if (!(obj instanceof RpcConnectionKey)) { + return false; + } + + return toString().equals(obj.toString()); + } + + @Override + public int hashCode() { + return description.hashCode(); + } + } +} diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/RpcConnectionPool.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/RpcConnectionPool.java deleted file mode 100644 index 43feeb1eab..0000000000 --- a/tajo-rpc/src/main/java/org/apache/tajo/rpc/RpcConnectionPool.java +++ /dev/null @@ -1,194 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.rpc; - -import com.google.common.base.Objects; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import io.netty.channel.ConnectTimeoutException; -import io.netty.channel.group.ChannelGroup; -import io.netty.channel.group.DefaultChannelGroup; -import io.netty.util.concurrent.GlobalEventExecutor; -import io.netty.util.internal.logging.CommonsLoggerFactory; -import io.netty.util.internal.logging.InternalLoggerFactory; - -import java.net.InetSocketAddress; -import java.util.HashMap; -import java.util.Map; - -public class RpcConnectionPool { - private static final Log LOG = LogFactory.getLog(RpcConnectionPool.class); - - private Map connections = - new HashMap(); - private ChannelGroup accepted = new DefaultChannelGroup(GlobalEventExecutor.INSTANCE); - - private static RpcConnectionPool instance; - private final Object lockObject = new Object(); - - public final static int RPC_RETRIES = 3; - - private RpcConnectionPool() { - } - - public synchronized static RpcConnectionPool getPool() { - if(instance == null) { - InternalLoggerFactory.setDefaultFactory(new CommonsLoggerFactory()); - instance = new RpcConnectionPool(); - } - return instance; - } - - private NettyClientBase makeConnection(RpcConnectionKey rpcConnectionKey) - throws NoSuchMethodException, ClassNotFoundException, ConnectTimeoutException { - NettyClientBase client; - if(rpcConnectionKey.asyncMode) { - client = new AsyncRpcClient(rpcConnectionKey.protocolClass, rpcConnectionKey.addr, - RPC_RETRIES); - } else { - client = new BlockingRpcClient(rpcConnectionKey.protocolClass, rpcConnectionKey.addr, - RPC_RETRIES); - } - accepted.add(client.getChannel()); - return client; - } - - public NettyClientBase getConnection(InetSocketAddress addr, - Class protocolClass, boolean asyncMode) - throws NoSuchMethodException, ClassNotFoundException, 
ConnectTimeoutException { - RpcConnectionKey key = new RpcConnectionKey(addr, protocolClass, asyncMode); - NettyClientBase client = connections.get(key); - - if (client == null) { - synchronized (lockObject){ - client = connections.get(key); - if (client == null) { - client = makeConnection(key); - connections.put(key, client); - } - } - } - - if (client.getChannel() == null || !client.getChannel().isOpen() || !client.getChannel().isActive()) { - LOG.warn("Try to reconnect : " + addr); - client.connect(addr); - } - return client; - } - - public void releaseConnection(NettyClientBase client) { - if (client == null) return; - - try { - synchronized (lockObject) { - if (!client.getChannel().isOpen()) { - connections.remove(client.getKey()); - client.close(); - } - } - - if(LOG.isDebugEnabled()) { - LOG.debug("Current Connections [" + connections.size() + "] Accepted: " + accepted.size()); - - } - } catch (Exception e) { - LOG.error("Can't close connection:" + client.getKey() + ":" + e.getMessage(), e); - } - } - - public void closeConnection(NettyClientBase client) { - if (client == null) { - return; - } - - try { - if(LOG.isDebugEnabled()) { - LOG.debug("Close connection [" + client.getKey() + "]"); - } - - synchronized (lockObject) { - connections.remove(client.getKey()); - client.close(); - } - - } catch (Exception e) { - LOG.error("Can't close connection:" + client.getKey() + ":" + e.getMessage(), e); - } - } - - public synchronized void close() { - if(LOG.isDebugEnabled()) { - LOG.debug("Pool Closed"); - } - synchronized(lockObject) { - for(NettyClientBase eachClient: connections.values()) { - try { - eachClient.close(); - } catch (Exception e) { - LOG.error("close client pool error", e); - } - } - - connections.clear(); - } - - try { - accepted.close(); - } catch (Throwable t) { - LOG.error(t, t); - } - } - - public synchronized void shutdown(){ - close(); - RpcChannelFactory.shutdownGracefully(); - } - - static class RpcConnectionKey { - final InetSocketAddress 
addr; - final Class protocolClass; - final boolean asyncMode; - - public RpcConnectionKey(InetSocketAddress addr, - Class protocolClass, boolean asyncMode) { - this.addr = addr; - this.protocolClass = protocolClass; - this.asyncMode = asyncMode; - } - - @Override - public String toString() { - return "["+ protocolClass + "] " + addr + "," + asyncMode; - } - - @Override - public boolean equals(Object obj) { - if(!(obj instanceof RpcConnectionKey)) { - return false; - } - - return toString().equals(obj.toString()); - } - - @Override - public int hashCode() { - return Objects.hashCode(addr, asyncMode); - } - } -} diff --git a/tajo-rpc/src/main/java/org/apache/tajo/rpc/ServerCallable.java b/tajo-rpc/src/main/java/org/apache/tajo/rpc/ServerCallable.java index fb1cec253b..2804a03132 100644 --- a/tajo-rpc/src/main/java/org/apache/tajo/rpc/ServerCallable.java +++ b/tajo-rpc/src/main/java/org/apache/tajo/rpc/ServerCallable.java @@ -18,13 +18,11 @@ package org.apache.tajo.rpc; +import com.google.protobuf.ServiceException; + import java.io.IOException; import java.lang.reflect.UndeclaredThrowableException; import java.net.InetSocketAddress; -import java.util.ArrayList; -import java.util.List; - -import com.google.protobuf.ServiceException; public abstract class ServerCallable { protected InetSocketAddress addr; @@ -33,21 +31,16 @@ public abstract class ServerCallable { protected Class protocol; protected boolean asyncMode; protected boolean closeConn; - protected RpcConnectionPool connPool; + protected RpcClientManager manager; public abstract T call(NettyClientBase client) throws Exception; - public ServerCallable(RpcConnectionPool connPool, InetSocketAddress addr, Class protocol, boolean asyncMode) { - this(connPool, addr, protocol, asyncMode, false); - } - - public ServerCallable(RpcConnectionPool connPool, InetSocketAddress addr, Class protocol, - boolean asyncMode, boolean closeConn) { - this.connPool = connPool; + public ServerCallable(RpcClientManager manager, 
InetSocketAddress addr, Class protocol, + boolean asyncMode) { + this.manager = manager; this.addr = addr; this.protocol = protocol; this.asyncMode = asyncMode; - this.closeConn = closeConn; } public void beforeCall() { @@ -74,26 +67,24 @@ public void abort() { * Run this instance with retries, timed waits, * and refinds of missing regions. * - * @param the type of the return value * @return an object of type T * @throws com.google.protobuf.ServiceException if a remote or network exception occurs */ + public T withRetries() throws ServiceException { //TODO configurable final long pause = 500; //ms final int numRetries = 3; - List exceptions = new ArrayList(); for (int tries = 0; tries < numRetries; tries++) { NettyClientBase client = null; try { beforeCall(); if(addr != null) { - client = connPool.getConnection(addr, protocol, asyncMode); + client = manager.getClient(addr, protocol, asyncMode); } return call(client); } catch (IOException ioe) { - exceptions.add(ioe); if(abort) { throw new ServiceException(ioe.getMessage(), ioe); } @@ -105,9 +96,7 @@ public T withRetries() throws ServiceException { } finally { afterCall(); if(closeConn) { - connPool.closeConnection(client); - } else { - connPool.releaseConnection(client); + RpcClientManager.cleanup(client); } } try { @@ -122,7 +111,6 @@ public T withRetries() throws ServiceException { /** * Run this instance against the server once. 
- * @param the type of the return value * @return an object of type T * @throws java.io.IOException if a remote or network exception occurs * @throws RuntimeException other unspecified error @@ -131,7 +119,7 @@ public T withoutRetries() throws IOException, RuntimeException { NettyClientBase client = null; try { beforeCall(); - client = connPool.getConnection(addr, protocol, asyncMode); + client = manager.getClient(addr, protocol, asyncMode); return call(client); } catch (Throwable t) { Throwable t2 = translateException(t); @@ -143,9 +131,7 @@ public T withoutRetries() throws IOException, RuntimeException { } finally { afterCall(); if(closeConn) { - connPool.closeConnection(client); - } else { - connPool.releaseConnection(client); + RpcClientManager.cleanup(client); } } } diff --git a/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestAsyncRpc.java b/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestAsyncRpc.java index 31d52655c4..1e4959b6b5 100644 --- a/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestAsyncRpc.java +++ b/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestAsyncRpc.java @@ -19,6 +19,7 @@ package org.apache.tajo.rpc; import com.google.protobuf.RpcCallback; +import io.netty.channel.ConnectTimeoutException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.tajo.rpc.test.DummyProtocol; @@ -34,8 +35,6 @@ import org.junit.runner.Description; import org.junit.runners.model.Statement; -import io.netty.channel.ConnectTimeoutException; - import java.lang.annotation.ElementType; import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; @@ -60,17 +59,17 @@ public class TestAsyncRpc { Interface stub; DummyProtocolAsyncImpl service; int retries; - + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.METHOD) @interface SetupRpcConnection { boolean setupRpcServer() default true; boolean setupRpcClient() default true; } - + @Rule public ExternalResource resource = new ExternalResource() { - + 
private Description description; @Override @@ -86,7 +85,7 @@ protected void before() throws Throwable { if (setupRpcConnection == null || setupRpcConnection.setupRpcServer()) { setUpRpcServer(); } - + if (setupRpcConnection == null || setupRpcConnection.setupRpcClient()) { setUpRpcClient(); } @@ -103,7 +102,7 @@ protected void after() { fail(e.getMessage()); } } - + if (setupRpcConnection == null || setupRpcConnection.setupRpcServer()) { try { tearDownRpcServer(); @@ -112,21 +111,25 @@ protected void after() { } } } - + }; - + public void setUpRpcServer() throws Exception { service = new DummyProtocolAsyncImpl(); server = new AsyncRpcServer(DummyProtocol.class, service, new InetSocketAddress("127.0.0.1", 0), 2); server.start(); } - + public void setUpRpcClient() throws Exception { retries = 1; - client = new AsyncRpcClient(DummyProtocol.class, - RpcUtils.getConnectAddress(server.getListenAddress()), retries); + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey( + RpcUtils.getConnectAddress(server.getListenAddress()), + DummyProtocol.class, true); + client = new AsyncRpcClient(rpcConnectionKey, retries); + client.connect(); stub = client.getStub(); } @@ -134,14 +137,14 @@ public void setUpRpcClient() throws Exception { public static void tearDownClass() throws Exception { RpcChannelFactory.shutdownGracefully(); } - + public void tearDownRpcServer() throws Exception { if(server != null) { server.shutdown(); server = null; } } - + public void tearDownRpcClient() throws Exception { if(client != null) { client.close(); @@ -296,7 +299,11 @@ public void run() { }); serverThread.start(); - client = new AsyncRpcClient(DummyProtocol.class, address, retries); + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey(address, DummyProtocol.class, true); + client = new AsyncRpcClient(rpcConnectionKey, retries); + client.connect(); + assertTrue(client.isConnected()); stub = client.getStub(); 
stub.echo(future.getController(), echoMessage, future); @@ -310,7 +317,10 @@ public void testConnectionFailure() throws Exception { InetSocketAddress address = new InetSocketAddress("test", 0); boolean expected = false; try { - new AsyncRpcClient(DummyProtocol.class, address, retries); + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey(address, DummyProtocol.class, true); + NettyClientBase client = new AsyncRpcClient(rpcConnectionKey, retries); + client.connect(); fail(); } catch (ConnectTimeoutException e) { expected = true; @@ -318,14 +328,19 @@ public void testConnectionFailure() throws Exception { fail(); } assertTrue(expected); + } @Test @SetupRpcConnection(setupRpcClient=false) public void testUnresolvedAddress() throws Exception { String hostAndPort = RpcUtils.normalizeInetSocketAddress(server.getListenAddress()); - client = new AsyncRpcClient(DummyProtocol.class, - RpcUtils.createUnresolved(hostAndPort), retries); + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey( + RpcUtils.createUnresolved(hostAndPort), DummyProtocol.class, true); + client = new AsyncRpcClient(rpcConnectionKey, retries); + client.connect(); + assertTrue(client.isConnected()); Interface stub = client.getStub(); EchoMessage echoMessage = EchoMessage.newBuilder() .setMessage(MESSAGE).build(); @@ -336,4 +351,43 @@ public void testUnresolvedAddress() throws Exception { assertEquals(future.get(), echoMessage); assertTrue(future.isDone()); } + + @Test + public void testIdleTimeout() throws Exception { + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey(server.getListenAddress(), DummyProtocol.class, true); + AsyncRpcClient client = new AsyncRpcClient(rpcConnectionKey, retries, 1); //1 sec idle timeout + client.connect(); + assertTrue(client.isConnected()); + + Thread.sleep(2000); + assertFalse(client.isConnected()); + + client.connect(); // try to reconnect + 
assertTrue(client.isConnected()); + client.close(); + assertFalse(client.isConnected()); + } + + @Test + public void testIdleTimeoutWithActiveRequest() throws Exception { + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey(server.getListenAddress(), DummyProtocol.class, true); + AsyncRpcClient client = new AsyncRpcClient(rpcConnectionKey, retries, 1); //1 sec idle timeout + client.connect(); + + assertTrue(client.isConnected()); + Interface stub = client.getStub(); + EchoMessage echoMessage = EchoMessage.newBuilder() + .setMessage(MESSAGE).build(); + CallFuture future = new CallFuture(); + stub.deley(null, echoMessage, future); //3 sec delay + + assertFalse(future.isDone()); + assertEquals(future.get(), echoMessage); + assertTrue(future.isDone()); + + Thread.sleep(2000); + assertFalse(client.isConnected()); + } } \ No newline at end of file diff --git a/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestBlockingRpc.java b/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestBlockingRpc.java index 07e2dcaff0..8c0b475a93 100644 --- a/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestBlockingRpc.java +++ b/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestBlockingRpc.java @@ -18,6 +18,7 @@ package org.apache.tajo.rpc; +import io.netty.channel.ConnectTimeoutException; import org.apache.tajo.rpc.test.DummyProtocol; import org.apache.tajo.rpc.test.DummyProtocol.DummyProtocolService.BlockingInterface; import org.apache.tajo.rpc.test.TestProtos.EchoMessage; @@ -35,7 +36,6 @@ import java.lang.annotation.Retention; import java.lang.annotation.RetentionPolicy; import java.lang.annotation.Target; -import java.net.ConnectException; import java.net.InetSocketAddress; import java.net.ServerSocket; import java.util.concurrent.CountDownLatch; @@ -51,17 +51,17 @@ public class TestBlockingRpc { private BlockingInterface stub; private DummyProtocolBlockingImpl service; private int retries; - + @Retention(RetentionPolicy.RUNTIME) @Target(ElementType.METHOD) 
@interface SetupRpcConnection { boolean setupRpcServer() default true; boolean setupRpcClient() default true; } - + @Rule public ExternalResource resource = new ExternalResource() { - + private Description description; @Override @@ -73,11 +73,11 @@ public Statement apply(Statement base, Description description) { @Override protected void before() throws Throwable { SetupRpcConnection setupRpcConnection = description.getAnnotation(SetupRpcConnection.class); - + if (setupRpcConnection == null || setupRpcConnection.setupRpcServer()) { setUpRpcServer(); } - + if (setupRpcConnection == null || setupRpcConnection.setupRpcClient()) { setUpRpcClient(); } @@ -86,7 +86,7 @@ protected void before() throws Throwable { @Override protected void after() { SetupRpcConnection setupRpcConnection = description.getAnnotation(SetupRpcConnection.class); - + if (setupRpcConnection == null || setupRpcConnection.setupRpcClient()) { try { tearDownRpcClient(); @@ -94,7 +94,7 @@ protected void after() { fail(e.getMessage()); } } - + if (setupRpcConnection == null || setupRpcConnection.setupRpcServer()) { try { tearDownRpcServer(); @@ -103,21 +103,26 @@ protected void after() { } } } - + }; - + public void setUpRpcServer() throws Exception { service = new DummyProtocolBlockingImpl(); server = new BlockingRpcServer(DummyProtocol.class, service, new InetSocketAddress("127.0.0.1", 0), 2); server.start(); } - + public void setUpRpcClient() throws Exception { retries = 1; - client = new BlockingRpcClient(DummyProtocol.class, - RpcUtils.getConnectAddress(server.getListenAddress()), retries); + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey( + RpcUtils.getConnectAddress(server.getListenAddress()), + DummyProtocol.class, false); + client = new BlockingRpcClient(rpcConnectionKey, retries); + client.connect(); + assertTrue(client.isConnected()); stub = client.getStub(); } @@ -125,14 +130,14 @@ public void setUpRpcClient() throws Exception { public static 
void tearDownClass() throws Exception { RpcChannelFactory.shutdownGracefully(); } - + public void tearDownRpcServer() throws Exception { if(server != null) { server.shutdown(); server = null; } } - + public void tearDownRpcClient() throws Exception { if(client != null) { client.close(); @@ -159,7 +164,7 @@ public void testRpc() throws Exception { @Test @SetupRpcConnection(setupRpcClient=false) public void testRpcWithServiceCallable() throws Exception { - RpcConnectionPool pool = RpcConnectionPool.getPool(); + RpcClientManager manager = RpcClientManager.getInstance(); final SumRequest request = SumRequest.newBuilder() .setX1(1) .setX2(2) @@ -167,20 +172,20 @@ public void testRpcWithServiceCallable() throws Exception { .setX4(2.0f).build(); SumResponse response = - new ServerCallable(pool, - server.getListenAddress(), DummyProtocol.class, false) { - @Override - public SumResponse call(NettyClientBase client) throws Exception { - BlockingInterface stub2 = client.getStub(); - SumResponse response1 = stub2.sum(null, request); - return response1; - } - }.withRetries(); + new ServerCallable(manager, + server.getListenAddress(), DummyProtocol.class, false) { + @Override + public SumResponse call(NettyClientBase client) throws Exception { + BlockingInterface stub2 = client.getStub(); + SumResponse response1 = stub2.sum(null, request); + return response1; + } + }.withRetries(); assertEquals(8.15d, response.getResult(), 1e-15); response = - new ServerCallable(pool, + new ServerCallable(manager, server.getListenAddress(), DummyProtocol.class, false) { @Override public SumResponse call(NettyClientBase client) throws Exception { @@ -191,7 +196,7 @@ public SumResponse call(NettyClientBase client) throws Exception { }.withoutRetries(); assertTrue(8.15d == response.getResult()); - pool.close(); + RpcClientManager.close(); } @Test @@ -212,6 +217,22 @@ public void testThrowException() throws Exception { } } + @Test + public void testThrowException2() throws Exception { + EchoMessage 
message = EchoMessage.newBuilder() + .setMessage(MESSAGE).build(); + + try { + stub.throwException(null, message); + fail("RpcCall should throw exception"); + } catch (Throwable t) { + assertTrue(t instanceof TajoServiceException); + } + + EchoMessage message1 = stub.deley(null, message); + assertEquals(message, message1); + } + @Test @SetupRpcConnection(setupRpcServer=false,setupRpcClient=false) public void testConnectionRetry() throws Exception { @@ -238,7 +259,11 @@ public void run() { }); serverThread.start(); - client = new BlockingRpcClient(DummyProtocol.class, address, retries); + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey(address, DummyProtocol.class, false); + client = new BlockingRpcClient(rpcConnectionKey, retries); + client.connect(); + assertTrue(client.isConnected()); stub = client.getStub(); EchoMessage response = stub.echo(null, message); @@ -247,22 +272,21 @@ public void run() { @Test public void testConnectionFailed() throws Exception { - boolean expected = false; NettyClientBase client = null; - + boolean expected = false; try { int port = server.getListenAddress().getPort() + 1; - client = new BlockingRpcClient(DummyProtocol.class, - RpcUtils.getConnectAddress(new InetSocketAddress("127.0.0.1", port)), retries); - client.close(); - fail("Connection should be failed."); - } catch (ConnectException ce) { - expected = true; - } catch (Throwable ce){ - if (client != null) { - client.close(); - } + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey( + RpcUtils.getConnectAddress(new InetSocketAddress("127.0.0.1", port)), + DummyProtocol.class, false); + client = new BlockingRpcClient(rpcConnectionKey, retries); + client.connect(); fail(); + } catch (ConnectTimeoutException e) { + expected = true; + } catch (Throwable e) { + fail(e.getMessage()); } assertTrue(expected); } @@ -329,8 +353,12 @@ public void run() { @SetupRpcConnection(setupRpcClient=false) public 
void testUnresolvedAddress() throws Exception { String hostAndPort = RpcUtils.normalizeInetSocketAddress(server.getListenAddress()); - client = new BlockingRpcClient(DummyProtocol.class, - RpcUtils.createUnresolved(hostAndPort), retries); + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey( + RpcUtils.createUnresolved(hostAndPort), DummyProtocol.class, false); + client = new BlockingRpcClient(rpcConnectionKey, retries); + client.connect(); + assertTrue(client.isConnected()); BlockingInterface stub = client.getStub(); EchoMessage message = EchoMessage.newBuilder() @@ -338,4 +366,41 @@ public void testUnresolvedAddress() throws Exception { EchoMessage response2 = stub.echo(null, message); assertEquals(MESSAGE, response2.getMessage()); } + + @Test + public void testIdleTimeout() throws Exception { + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey(server.getListenAddress(), DummyProtocol.class, false); + BlockingRpcClient client = new BlockingRpcClient(rpcConnectionKey, retries, 1); //1 sec idle timeout + client.connect(); + assertTrue(client.isConnected()); + + Thread.sleep(2000); + assertFalse(client.isConnected()); + + client.connect(); // try to reconnect + assertTrue(client.isConnected()); + client.close(); + assertFalse(client.isConnected()); + } + + @Test + public void testIdleTimeoutWithActiveRequest() throws Exception { + RpcClientManager.RpcConnectionKey rpcConnectionKey = + new RpcClientManager.RpcConnectionKey(server.getListenAddress(), DummyProtocol.class, false); + BlockingRpcClient client = new BlockingRpcClient(rpcConnectionKey, retries, 1); //1 sec idle timeout + + client.connect(); + + assertTrue(client.isConnected()); + BlockingInterface stub = client.getStub(); + EchoMessage echoMessage = EchoMessage.newBuilder() + .setMessage(MESSAGE).build(); + + EchoMessage message = stub.deley(null, echoMessage); //3 sec delay + assertEquals(message, echoMessage); + + 
Thread.sleep(2000); + assertFalse(client.isConnected()); + } } \ No newline at end of file diff --git a/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestRpcClientManager.java b/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestRpcClientManager.java new file mode 100644 index 0000000000..5f86518b6a --- /dev/null +++ b/tajo-rpc/src/test/java/org/apache/tajo/rpc/TestRpcClientManager.java @@ -0,0 +1,97 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.rpc; + +import org.apache.tajo.rpc.test.DummyProtocol; +import org.apache.tajo.rpc.test.impl.DummyProtocolAsyncImpl; +import org.junit.Test; + +import java.net.InetSocketAddress; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +public class TestRpcClientManager { + + @Test + public void testRaceCondition() throws Exception { + final int parallelCount = 50; + final DummyProtocolAsyncImpl service = new DummyProtocolAsyncImpl(); + NettyServerBase server = new AsyncRpcServer(DummyProtocol.class, + service, new InetSocketAddress("127.0.0.1", 0), parallelCount); + server.start(); + + final InetSocketAddress address = server.getListenAddress(); + final RpcClientManager manager = RpcClientManager.getInstance(); + + ExecutorService executor = Executors.newFixedThreadPool(parallelCount); + List tasks = new ArrayList(); + for (int i = 0; i < parallelCount; i++) { + tasks.add(executor.submit(new Runnable() { + @Override + public void run() { + NettyClientBase client = null; + try { + client = manager.getClient(address, DummyProtocol.class, false); + } catch (Throwable e) { + fail(e.getMessage()); + } + assertTrue(client.isConnected()); + } + }) + ); + } + + for (Future future : tasks) { + future.get(); + } + + NettyClientBase clientBase = manager.getClient(address, DummyProtocol.class, false); + RpcClientManager.cleanup(clientBase); + server.shutdown(); + executor.shutdown(); + } + + @Test + public void testCloseFuture() throws Exception { + final DummyProtocolAsyncImpl service = new DummyProtocolAsyncImpl(); + NettyServerBase server = new AsyncRpcServer(DummyProtocol.class, + service, new InetSocketAddress("127.0.0.1", 0), 3); + server.start(); + + final RpcClientManager manager = 
RpcClientManager.getInstance(); + + NettyClientBase client = manager.getClient(server.getListenAddress(), DummyProtocol.class, true); + assertTrue(client.isConnected()); + assertTrue(client.getChannel().isWritable()); + + RpcClientManager.RpcConnectionKey key = client.getKey(); + assertTrue(RpcClientManager.contains(key)); + + client.close(); + assertFalse(RpcClientManager.contains(key)); + server.shutdown(); + } +} \ No newline at end of file From 3722599d72c012d508cf5201cc101111affa764e Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Mon, 20 Apr 2015 20:37:59 +0900 Subject: [PATCH 019/141] TAJO-1481: Numeric conversion of Inet4 type should be considered as unsigned. Signed-off-by: Jihoon Son --- CHANGES | 3 +++ .../src/main/java/org/apache/tajo/datum/Inet4Datum.java | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index e9b5886a0b..ca983e095f 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1481: Numeric conversion of Inet4 type should be considered as unsigned. + (Contributed by navis, Committed by jihoon) + TAJO-1522: NPE making stage history before task scheduler is initialized. 
(Contributed by navis, Committed by jinho) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/Inet4Datum.java b/tajo-common/src/main/java/org/apache/tajo/datum/Inet4Datum.java index 1de81cd9f5..ab1799bac4 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/Inet4Datum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/Inet4Datum.java @@ -19,6 +19,7 @@ package org.apache.tajo.datum; import com.google.common.base.Preconditions; +import com.google.common.primitives.UnsignedInteger; import com.google.gson.annotations.Expose; import org.apache.tajo.exception.InvalidOperationException; import org.apache.tajo.util.Bytes; @@ -68,7 +69,7 @@ public int asInt4() { @Override public long asInt8() { - return this.address; + return UnsignedInteger.asUnsigned(address).longValue(); } @Override From 8d7988438dafd35068c52031393a9fb1c040bdf5 Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Tue, 21 Apr 2015 06:18:37 +0900 Subject: [PATCH 020/141] TAJO-1419: Tsql session command doesn't work. Signed-off-by: Jihoon Son --- CHANGES | 3 ++ .../java/org/apache/tajo/SessionVars.java | 2 +- .../org/apache/tajo/cli/tsql/TestTajoCli.java | 40 +++++++++++++++++++ .../TestTajoCli/testHelpSessionVars.result | 2 +- 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index ca983e095f..a1a14c4cdb 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1419: Tsql session command doesn't work. (Contributed by DaeMyung Kang, + Committed by jihoon) + TAJO-1481: Numeric conversion of Inet4 type should be considered as unsigned. 
(Contributed by navis, Committed by jihoon) diff --git a/tajo-common/src/main/java/org/apache/tajo/SessionVars.java b/tajo-common/src/main/java/org/apache/tajo/SessionVars.java index b3233edc10..15ee73a80b 100644 --- a/tajo-common/src/main/java/org/apache/tajo/SessionVars.java +++ b/tajo-common/src/main/java/org/apache/tajo/SessionVars.java @@ -71,7 +71,7 @@ public enum SessionVars implements ConfigKey { ON_ERROR_STOP(ConfVars.$CLI_ERROR_STOP, "tsql will exist if an error occurs.", CLI_SIDE_VAR), // Timezone & Date ---------------------------------------------------------- - TIMEZONE(ConfVars.$TIMEZONE, "Sets timezone", CLI_SIDE_VAR), + TIMEZONE(ConfVars.$TIMEZONE, "Sets timezone", DEFAULT), DATE_ORDER(ConfVars.$DATE_ORDER, "date order (default is YMD)", CLI_SIDE_VAR), // Locales and Character set ------------------------------------------------ diff --git a/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestTajoCli.java b/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestTajoCli.java index d4a5a1f840..487c497a11 100644 --- a/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestTajoCli.java +++ b/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestTajoCli.java @@ -350,6 +350,46 @@ public void testHelpSessionVars() throws Exception { assertOutputResult(new String(out.toByteArray())); } + @Test + public void testTimeZoneSessionVars1() throws Exception { + tajoCli.executeMetaCommand("\\set TIMEZONE GMT+1"); + tajoCli.executeMetaCommand("\\set"); + String output = new String(out.toByteArray()); + assertTrue(output.contains("'TIMEZONE'='GMT+1'")); + } + + @Test + public void testTimeZoneSessionVars2() throws Exception { + tajoCli.executeScript("SET TIME ZONE 'GMT+2'"); + tajoCli.executeMetaCommand("\\set"); + String output = new String(out.toByteArray()); + assertTrue(output.contains("'TIMEZONE'='GMT+2'")); + } + + @Test + public void testTimeZoneTest1() throws Exception { + String tableName = "test1"; + tajoCli.executeMetaCommand("\\set TIMEZONE GMT+0"); + 
tajoCli.executeScript("create table " + tableName + " (col1 TIMESTAMP)"); + tajoCli.executeScript("insert into " + tableName + " select to_timestamp(0)"); + tajoCli.executeScript("select * from " + tableName); + String consoleResult = new String(out.toByteArray()); + tajoCli.executeScript("DROP TABLE " + tableName + " PURGE"); + assertTrue(consoleResult.contains("1970-01-01 00:00:00")); + } + + @Test + public void testTimeZoneTest2() throws Exception { + String tableName = "test1"; + tajoCli.executeMetaCommand("\\set TIMEZONE GMT+1"); + tajoCli.executeScript("create table " + tableName + " (col1 TIMESTAMP)"); + tajoCli.executeScript("insert into " + tableName + " select to_timestamp(0)"); + tajoCli.executeScript("select * from " + tableName); + String consoleResult = new String(out.toByteArray()); + tajoCli.executeScript("DROP TABLE " + tableName + " PURGE"); + assertTrue(consoleResult.contains("1970-01-01 01:00:00")); + } + @Test(timeout = 3000) public void testNonForwardQueryPause() throws Exception { final String sql = "select * from default.lineitem"; diff --git a/tajo-core/src/test/resources/results/TestTajoCli/testHelpSessionVars.result b/tajo-core/src/test/resources/results/TestTajoCli/testHelpSessionVars.result index b5b7c229c9..bcd897078c 100644 --- a/tajo-core/src/test/resources/results/TestTajoCli/testHelpSessionVars.result +++ b/tajo-core/src/test/resources/results/TestTajoCli/testHelpSessionVars.result @@ -36,4 +36,4 @@ Available Session Variables: \set CODEGEN [true or false] - Runtime code generation enabled (experiment) \set ARITHABORT [true or false] - If true, a running query will be terminated when an overflow or divide-by-zero occurs. 
\set FETCH_ROWNUM [int value] - Sets the number of rows at a time from Master -\set DEBUG_ENABLED [true or false] - (debug only) debug mode enabled \ No newline at end of file +\set DEBUG_ENABLED [true or false] - (debug only) debug mode enabled From c33b8630ff92eb6e0b123735332c82c2c262bbee Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 24 Apr 2015 11:13:08 +0900 Subject: [PATCH 021/141] TAJO-1575: HBASE_HOME guidance is duplicated in tajo-env.sh. Signed-off-by: Jihoon Son --- CHANGES | 3 +++ tajo-dist/src/main/conf/tajo-env.sh | 5 +---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index a1a14c4cdb..831967b914 100644 --- a/CHANGES +++ b/CHANGES @@ -89,6 +89,9 @@ Release 0.10.1 - unreleased TASKS + TAJO-1575: HBASE_HOME guidance is duplicated in tajo-env.sh. + (Contributed by Jongyoung Park, Committed by jihoon) + TAJO-1571: Merge TAJO-1497 and TAJO-1569 to 0.10.1. (jinho) TAJO-1568: Apply UnpooledByteBufAllocator when a tajo.test.enabled diff --git a/tajo-dist/src/main/conf/tajo-env.sh b/tajo-dist/src/main/conf/tajo-env.sh index 4a8e2b2ea9..353d87d27e 100755 --- a/tajo-dist/src/main/conf/tajo-env.sh +++ b/tajo-dist/src/main/conf/tajo-env.sh @@ -24,9 +24,6 @@ # Hadoop home. Required # export HADOOP_HOME= -# HBase home. optional -# export HBASE_HOME= - # The java implementation to use. Required. # export JAVA_HOME=/usr/java/default @@ -82,5 +79,5 @@ export TAJO_WORKER_STANDBY_MODE=true # Tajo PullServer mode. the default mode is standalone mode # export TAJO_PULLSERVER_STANDALONE=false -# It must be required to use HBase +# HBase home directory. It is opitional, but is required mandatorily to use HBase. # export HBASE_HOME= From b488a5e09e19fd0400147ba530b070495186c7f4 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 24 Apr 2015 11:17:00 +0900 Subject: [PATCH 022/141] TAJO-1559: Fix data model description (tinyint, smallint). 
Signed-off-by: Jihoon Son --- CHANGES | 3 +++ tajo-docs/src/main/sphinx/sql_language/data_model.rst | 4 +++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 831967b914..be3c1f8794 100644 --- a/CHANGES +++ b/CHANGES @@ -89,6 +89,9 @@ Release 0.10.1 - unreleased TASKS + TAJO-1559: Fix data model description (tinyint, smallint). + (Contributed by Dongjoon Hyun, Committed by jihoon) + TAJO-1575: HBASE_HOME guidance is duplicated in tajo-env.sh. (Contributed by Jongyoung Park, Committed by jihoon) diff --git a/tajo-docs/src/main/sphinx/sql_language/data_model.rst b/tajo-docs/src/main/sphinx/sql_language/data_model.rst index a0c5856614..9c52b3ec5f 100644 --- a/tajo-docs/src/main/sphinx/sql_language/data_model.rst +++ b/tajo-docs/src/main/sphinx/sql_language/data_model.rst @@ -15,7 +15,9 @@ Data Types +-----------+----------------+----------------------------+-------------+---------------------------------------------------+--------------------------------------------------------------------------+ | | varbit | bit varying | | | | +-----------+----------------+----------------------------+-------------+---------------------------------------------------+--------------------------------------------------------------------------+ -| O | smallint | tinyint, int2 | 2 | small-range integer value | -2^15 (-32,768) to 2^15 (32,767) | +| O | tinyint | int1 | 1 | tiny-range integer value | -2^7 (-128) to 2^7-1 (127) | ++-----------+----------------+----------------------------+-------------+---------------------------------------------------+--------------------------------------------------------------------------+ +| O | smallint | int2 | 2 | small-range integer value | -2^15 (-32,768) to 2^15-1 (32,767) | +-----------+----------------+----------------------------+-------------+---------------------------------------------------+--------------------------------------------------------------------------+ | O | integer | int, int4 | 4 | integer 
value | -2^31 (-2,147,483,648) to 2^31 - 1 (2,147,483,647) | +-----------+----------------+----------------------------+-------------+---------------------------------------------------+--------------------------------------------------------------------------+ From 9ca7688f648d433d0a0eaf5dedb1919b67f106d8 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Mon, 27 Apr 2015 14:16:54 +0900 Subject: [PATCH 023/141] TAJO-1580: Error line number is incorrect. Signed-off-by: JaeHwa Jung --- CHANGES | 3 +++ .../apache/tajo/cli/tsql/SimpleParser.java | 11 +++++----- .../tajo/cli/tsql/TestSimpleParser.java | 20 +++++++++++++++---- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/CHANGES b/CHANGES index be3c1f8794..b7012c8f87 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1580: Error line number is incorrect. + (Contributed by Jongyoung Park. Committed by jaehwa) + TAJO-1419: Tsql session command doesn't work. (Contributed by DaeMyung Kang, Committed by jihoon) diff --git a/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/SimpleParser.java b/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/SimpleParser.java index cc772a3381..940a5ff46a 100644 --- a/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/SimpleParser.java +++ b/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/SimpleParser.java @@ -89,7 +89,10 @@ public List parseLines(String str) throws InvalidStatementExceptio // if parsing continues, it means that the previous line is broken by '\n'. // So, we should add new line to rawAppender. 
- if (isStatementContinue()) { + int appenderLen = rawAppender.length(); + if (appenderLen != 0 + && rawAppender.charAt(appenderLen - 1) != '\n' + && isStatementContinue()) { rawAppender.append("\n"); } @@ -180,7 +183,7 @@ public List parseLines(String str) throws InvalidStatementExceptio appendToBothStatements(chars, lineStartIdx, idx, 2); // omit two dash characters '--' from history statement int commentStartIdx = idx; idx = consumeInlineComment(chars, idx); - appendToRawStatement(str.subSequence(commentStartIdx, idx).toString(), true); + appendToRawStatement(str.subSequence(commentStartIdx, idx).toString(), false); lineStartIdx = idx; } /////////////////////////////////////////////////////// @@ -281,6 +284,7 @@ private int consumeInlineComment(char [] chars, int currentIdx) { private void appendToRawStatement(String str, boolean addLF) { if (!str.isEmpty() && !"\n".equals(str) && rawAppender.length() > 0 && addLF && rawAppender.charAt(rawAppender.length() - 1) != '\n') { + rawAppender.append("\n"); rawAppender.append(str); } else { rawAppender.append(str); @@ -297,9 +301,6 @@ private static boolean isEndOfStatement(char character) { /** * It checks if inline comment '--' begins. 
- * @param chars - * @param idx - * @return */ private boolean isInlineCommentStart(char[] chars, int idx) { if (idx >= chars.length - 1) { diff --git a/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestSimpleParser.java b/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestSimpleParser.java index 33a56219ea..69bf30aae8 100644 --- a/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestSimpleParser.java +++ b/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestSimpleParser.java @@ -18,9 +18,6 @@ package org.apache.tajo.cli.tsql; -import org.apache.tajo.cli.tsql.InvalidStatementException; -import org.apache.tajo.cli.tsql.ParsedResult; -import org.apache.tajo.cli.tsql.SimpleParser; import org.junit.Test; import java.util.List; @@ -159,6 +156,8 @@ public final void testParseLines() throws InvalidStatementException { assertEquals("select * from test1", res1.get(0).getHistoryStatement()); assertEquals("select * from test2", res1.get(1).getHistoryStatement()); + // select * from + // test1; select * from test2; simpleParser = new SimpleParser(); res1 = simpleParser.parseLines("select * from "); assertEquals(0, res1.size()); @@ -182,7 +181,6 @@ public final void testParseLines() throws InvalidStatementException { assertEquals("select * from test3", res1.get(0).getHistoryStatement()); assertEquals("select * from \n--test1; select * from test2;\ntest3", res1.get(0).getStatement()); - // select * from // test1 --select * from test2; // where col1 = '123'; @@ -195,6 +193,20 @@ public final void testParseLines() throws InvalidStatementException { assertEquals(1, res1.size()); assertEquals("select * from test1 where col1 = '123'", res1.get(0).getHistoryStatement()); assertEquals("select * from \ntest1 --select * from test2;\nwhere col1 = '123'", res1.get(0).getStatement()); + + // Case for sql statement already including '\n' + // This test is important for tsql because CLI input always has '\n'. 
+ simpleParser = new SimpleParser(); + res1 = simpleParser.parseLines("select\n"); + assertEquals(0, res1.size()); + res1 = simpleParser.parseLines("*\n"); + assertEquals(0, res1.size()); + res1 = simpleParser.parseLines("from\n"); + assertEquals(0, res1.size()); + res1 = simpleParser.parseLines("test1;\n"); + assertEquals(1, res1.size()); + assertEquals("select\n*\nfrom\ntest1", res1.get(0).getStatement()); + assertEquals("select * from test1", res1.get(0).getHistoryStatement()); } @Test From c9b5e11347c4e380a3dd97671758a841941ae75b Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Mon, 27 Apr 2015 14:45:47 +0900 Subject: [PATCH 024/141] TAJO-1581: Does not update last state of query stage in non-hash shuffle. (jinho) --- CHANGES | 3 + .../org/apache/tajo/querymaster/Stage.java | 12 +-- .../tajo/querymaster/TestQueryState.java | 93 +++++++++++++++++++ 3 files changed, 102 insertions(+), 6 deletions(-) create mode 100644 tajo-core/src/test/java/org/apache/tajo/querymaster/TestQueryState.java diff --git a/CHANGES b/CHANGES index b7012c8f87..6cd7ad7372 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1581: Does not update last state of query stage in non-hash shuffle. + (jinho) + TAJO-1580: Error line number is incorrect. (Contributed by Jongyoung Park. Committed by jaehwa) diff --git a/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java b/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java index 80ccc213e2..cf8c1cea4e 100644 --- a/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java +++ b/tajo-core/src/main/java/org/apache/tajo/querymaster/Stage.java @@ -1312,7 +1312,7 @@ private void finalizeShuffleReport(StageShuffleReportEvent event, ShuffleType ty if (!report.getReportSuccess()) { stopFinalization(); LOG.error(getId() + ", " + type + " report are failed. 
Caused by:" + report.getReportErrorMessage()); - eventHandler.handle(new StageEvent(getId(), StageEventType.SQ_FAILED)); + getEventHandler().handle(new StageEvent(getId(), StageEventType.SQ_FAILED)); } completedShuffleTasks.addAndGet(report.getSucceededTasks()); @@ -1324,7 +1324,7 @@ private void finalizeShuffleReport(StageShuffleReportEvent event, ShuffleType ty if (completedShuffleTasks.get() >= succeededObjectCount) { LOG.info(getId() + ", Finalized " + type + " reports: " + completedShuffleTasks.get()); - eventHandler.handle(new StageEvent(getId(), StageEventType.SQ_STAGE_COMPLETED)); + getEventHandler().handle(new StageEvent(getId(), StageEventType.SQ_STAGE_COMPLETED)); if (timeoutChecker != null) { stopFinalization(); synchronized (timeoutChecker){ @@ -1390,7 +1390,7 @@ public void run() { stage.stopFinalization(); LOG.error(stage.getId() + ": Timed out while receiving intermediate reports: " + elapsedTime + " ms, report:" + stage.completedShuffleTasks.get() + "/" + stage.succeededObjectCount); - stage.eventHandler.handle(new StageEvent(stage.getId(), StageEventType.SQ_FAILED)); + stage.getEventHandler().handle(new StageEvent(stage.getId(), StageEventType.SQ_FAILED)); } synchronized (this) { try { @@ -1404,14 +1404,14 @@ public void run() { stage.timeoutChecker.start(); } } else { - stage.handle(new StageEvent(stage.getId(), StageEventType.SQ_STAGE_COMPLETED)); + stage.getEventHandler().handle(new StageEvent(stage.getId(), StageEventType.SQ_STAGE_COMPLETED)); } } } catch (Throwable t) { LOG.error(t.getMessage(), t); stage.stopFinalization(); - stage.eventHandler.handle(new StageDiagnosticsUpdateEvent(stage.getId(), t.getMessage())); - stage.eventHandler.handle(new StageEvent(stage.getId(), StageEventType.SQ_INTERNAL_ERROR)); + stage.getEventHandler().handle(new StageDiagnosticsUpdateEvent(stage.getId(), t.getMessage())); + stage.getEventHandler().handle(new StageEvent(stage.getId(), StageEventType.SQ_INTERNAL_ERROR)); } } } diff --git 
a/tajo-core/src/test/java/org/apache/tajo/querymaster/TestQueryState.java b/tajo-core/src/test/java/org/apache/tajo/querymaster/TestQueryState.java new file mode 100644 index 0000000000..a822e4251d --- /dev/null +++ b/tajo-core/src/test/java/org/apache/tajo/querymaster/TestQueryState.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.querymaster; + +import org.apache.tajo.*; +import org.apache.tajo.client.TajoClient; +import org.apache.tajo.ipc.ClientProtos; +import org.apache.tajo.master.QueryManager; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; + +@Category(IntegrationTest.class) +public class TestQueryState { + private static TajoTestingCluster cluster; + private static TajoClient client; + + @BeforeClass + public static void setUp() throws Exception { + cluster = TpchTestBase.getInstance().getTestingCluster(); + client = cluster.newTajoClient(); + } + + @Test(timeout = 10000) + public void testSucceededState() throws Exception { + String queryStr = "select l_orderkey from lineitem group by l_orderkey order by l_orderkey"; + /* + ======================================================= + Block Id: eb_1429886996479_0001_000001 [LEAF] HASH_SHUFFLE + Block Id: eb_1429886996479_0001_000002 [INTERMEDIATE] RANGE_SHUFFLE + Block Id: eb_1429886996479_0001_000003 [ROOT] NONE_SHUFFLE + Block Id: eb_1429886996479_0001_000004 [TERMINAL] + ======================================================= + + The order of execution: + + 1: eb_1429886996479_0001_000001 + 2: eb_1429886996479_0001_000002 + 3: eb_1429886996479_0001_000003 + 4: eb_1429886996479_0001_000004 + */ + + ClientProtos.SubmitQueryResponse res = client.executeQuery(queryStr); + QueryId queryId = new QueryId(res.getQueryId()); + cluster.waitForQuerySubmitted(queryId); + + QueryMasterTask qmt = cluster.getQueryMasterTask(queryId); + Query query = qmt.getQuery(); + + // wait for query complete + cluster.waitForQueryState(query, TajoProtos.QueryState.QUERY_SUCCEEDED, 100); + + assertEquals(TajoProtos.QueryState.QUERY_SUCCEEDED, qmt.getState()); + + assertEquals(TajoProtos.QueryState.QUERY_SUCCEEDED, query.getSynchronizedState()); + 
assertEquals(TajoProtos.QueryState.QUERY_SUCCEEDED, query.getState()); + + assertFalse(query.getStages().isEmpty()); + for (Stage stage : query.getStages()) { + assertEquals(StageState.SUCCEEDED, stage.getSynchronizedState()); + assertEquals(StageState.SUCCEEDED, stage.getState()); + } + + /* wait for heartbeat from QueryMaster */ + QueryManager queryManager = cluster.getMaster().getContext().getQueryJobManager(); + for (; ; ) { + if (queryManager.getFinishedQuery(queryId) != null) break; + else Thread.sleep(100); + } + + /* get status from TajoMaster */ + assertEquals(TajoProtos.QueryState.QUERY_SUCCEEDED, client.getQueryStatus(queryId).getState()); + } +} From 7ccd8341ab45db79202a6c35ada56533107ee2aa Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 27 Apr 2015 22:35:27 +0900 Subject: [PATCH 025/141] TAJO-1574: Fix NPE on natural join. Signed-off-by: Jihoon Son --- CHANGES | 3 +++ .../java/org/apache/tajo/algebra/Join.java | 4 ++- .../tajo/engine/query/TestJoinQuery.java | 13 +++++++++ .../queries/TestJoinQuery/testNaturalJoin.sql | 3 +++ .../TestJoinQuery/testNaturalJoin.result | 27 +++++++++++++++++++ .../tajo/plan/algebra/BaseAlgebraVisitor.java | 4 ++- 6 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 tajo-core/src/test/resources/queries/TestJoinQuery/testNaturalJoin.sql create mode 100644 tajo-core/src/test/resources/results/TestJoinQuery/testNaturalJoin.result diff --git a/CHANGES b/CHANGES index 6cd7ad7372..9fdcb09c40 100644 --- a/CHANGES +++ b/CHANGES @@ -27,6 +27,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1574: Fix NPE on natural join. + (Contributed by Dongjoon Hyun, Committed by jihoon) + TAJO-1581: Does not update last state of query stage in non-hash shuffle. 
(jinho) diff --git a/tajo-algebra/src/main/java/org/apache/tajo/algebra/Join.java b/tajo-algebra/src/main/java/org/apache/tajo/algebra/Join.java index 2b1f3442a5..6b3ce61d89 100644 --- a/tajo-algebra/src/main/java/org/apache/tajo/algebra/Join.java +++ b/tajo-algebra/src/main/java/org/apache/tajo/algebra/Join.java @@ -96,7 +96,9 @@ public String toJson() { public Object clone() throws CloneNotSupportedException { Join join = (Join) super.clone(); join.joinType = joinType; - join.joinQual = (Expr) joinQual.clone(); + if (joinQual != null) { + join.joinQual = (Expr) joinQual.clone(); + } if (joinColumns != null) { join.joinColumns = new ColumnReferenceExpr[joinColumns.length]; for (ColumnReferenceExpr colume : joinColumns) { diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestJoinQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestJoinQuery.java index 9ab32ff19b..f844a8f057 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestJoinQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestJoinQuery.java @@ -18,6 +18,7 @@ package org.apache.tajo.engine.query; +import junit.framework.Assert; import org.apache.tajo.IntegrationTest; import org.apache.tajo.QueryTestCaseBase; import org.apache.tajo.TajoConstants; @@ -1170,4 +1171,16 @@ public final void testJoinFilterOfRowPreservedTable1() throws Exception { assertResultSet(res); cleanupQuery(res); } + + @Test + public final void testNaturalJoin() throws Exception { + ResultSet res = null; + try { + res = executeQuery(); + } catch (Exception e) { + Assert.fail(); + } + assertResultSet(res); + cleanupQuery(res); + } } diff --git a/tajo-core/src/test/resources/queries/TestJoinQuery/testNaturalJoin.sql b/tajo-core/src/test/resources/queries/TestJoinQuery/testNaturalJoin.sql new file mode 100644 index 0000000000..fcbdcdc838 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestJoinQuery/testNaturalJoin.sql @@ -0,0 +1,3 @@ +select n1.n_name, n2.n_name 
+from nation n1 natural join nation n2 +order by n2.n_name; \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestJoinQuery/testNaturalJoin.result b/tajo-core/src/test/resources/results/TestJoinQuery/testNaturalJoin.result new file mode 100644 index 0000000000..fa5a71ed07 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestJoinQuery/testNaturalJoin.result @@ -0,0 +1,27 @@ +n_name,n_name +------------------------------- +ALGERIA,ALGERIA +ARGENTINA,ARGENTINA +BRAZIL,BRAZIL +CANADA,CANADA +CHINA,CHINA +EGYPT,EGYPT +ETHIOPIA,ETHIOPIA +FRANCE,FRANCE +GERMANY,GERMANY +INDIA,INDIA +INDONESIA,INDONESIA +IRAN,IRAN +IRAQ,IRAQ +JAPAN,JAPAN +JORDAN,JORDAN +KENYA,KENYA +MOROCCO,MOROCCO +MOZAMBIQUE,MOZAMBIQUE +PERU,PERU +ROMANIA,ROMANIA +RUSSIA,RUSSIA +SAUDI ARABIA,SAUDI ARABIA +UNITED KINGDOM,UNITED KINGDOM +UNITED STATES,UNITED STATES +VIETNAM,VIETNAM \ No newline at end of file diff --git a/tajo-plan/src/main/java/org/apache/tajo/plan/algebra/BaseAlgebraVisitor.java b/tajo-plan/src/main/java/org/apache/tajo/plan/algebra/BaseAlgebraVisitor.java index bd105144de..eb11f33e3c 100644 --- a/tajo-plan/src/main/java/org/apache/tajo/plan/algebra/BaseAlgebraVisitor.java +++ b/tajo-plan/src/main/java/org/apache/tajo/plan/algebra/BaseAlgebraVisitor.java @@ -364,7 +364,9 @@ public RESULT visitGroupBy(CONTEXT ctx, Stack stack, Aggregation expr) thr @Override public RESULT visitJoin(CONTEXT ctx, Stack stack, Join expr) throws PlanningException { stack.push(expr); - visit(ctx, stack, expr.getQual()); + if (expr.getQual() != null) { + visit(ctx, stack, expr.getQual()); + } visit(ctx, stack, expr.getLeft()); RESULT result = visit(ctx, stack, expr.getRight()); stack.pop(); From 4db8ca055897ba712cb2bdc40594d63905a11e64 Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Thu, 30 Apr 2015 19:15:00 +0900 Subject: [PATCH 026/141] TAJO-1374: Support multi-bytes delimiter for CSV file. 
Signed-off-by: Jinho Kim --- CHANGES | 5 +- .../java/org/apache/tajo/util/BytesUtils.java | 159 ++++++++++-------- .../org/apache/tajo/util/StringUtils.java | 6 +- .../org/apache/tajo/util/TestStringUtil.java | 4 +- .../apache/tajo/engine/eval/ExprTestBase.java | 5 +- .../tajo/engine/query/TestSelectQuery.java | 24 +++ .../multibytes_delimiter1/table1.tbl | 5 + .../multibytes_delimiter2/table2.tbl | 5 + .../multibytes_delimiter_table1_ddl.sql | 3 + .../multibytes_delimiter_table2_ddl.sql | 3 + .../testMultiBytesDelimiter1.sql | 1 + .../testMultiBytesDelimiter2.sql | 1 + .../testMultiBytesDelimiter1.result | 7 + .../testMultiBytesDelimiter2.result | 7 + .../apache/tajo/storage/TestLazyTuple.java | 4 +- .../tajo/storage/hbase/ColumnMapping.java | 6 +- .../tajo/storage/hbase/HBaseScanner.java | 3 +- .../storage/hbase/HBaseStorageManager.java | 3 +- .../java/org/apache/tajo/storage/CSVFile.java | 14 +- .../sequencefile/SequenceFileScanner.java | 3 +- 20 files changed, 181 insertions(+), 87 deletions(-) create mode 100644 tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl create mode 100644 tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl create mode 100644 tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql create mode 100644 tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql create mode 100644 tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql create mode 100644 tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql create mode 100644 tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result create mode 100644 tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result diff --git a/CHANGES b/CHANGES index 9fdcb09c40..e19eaac219 100644 --- a/CHANGES +++ b/CHANGES @@ -8,7 +8,10 @@ Release 0.10.1 - unreleased (jihun) 
IMPROVEMENT - + + TAJO-1374: Support multi-bytes delimiter for CSV file. + (Contributed by navis, Committed by jinho) + TAJO-1400: Add TajoStatement::setMaxRows method support. (Contributed by YeonSu Han, Committed by jihoon) diff --git a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java index 91165accb9..725301cfc3 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/BytesUtils.java @@ -22,6 +22,7 @@ import java.io.ByteArrayOutputStream; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -86,22 +87,23 @@ static byte[] toASCIIBytes(char[] chars) { return buffer; } - public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target) { - return splitWorker(str, 0, -1, separatorChar, true, target); + public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int[] target, int numColumns) { + return splitWorker(str, 0, -1, separatorChar, target, numColumns); } - public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, char separatorChar, int[] target) { - return splitWorker(str, offset, length, separatorChar, true, target); + public static byte[][] splitPreserveAllTokens(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) { + return splitWorker(str, offset, length, separator, target, numColumns); } - public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar) { - return splitWorker(str, 0, -1, separatorChar, true, null); + public static byte[][] splitPreserveAllTokens(byte[] str, char separatorChar, int numColumns) { + return splitWorker(str, 0, -1, separatorChar, null, numColumns); } - public static byte[][] splitPreserveAllTokens(byte[] str, int length, char separatorChar) { - return splitWorker(str, 0, length, separatorChar, true, null); + private static byte[][] 
splitWorker(byte[] str, int offset, int length, char separatorChar, + int[] target, int numColumns) { + return splitWorker(str, offset, length, new byte[] {(byte)separatorChar}, target, numColumns); } - + /** * Performs the logic for the split and * splitPreserveAllTokens methods that do not return a @@ -109,75 +111,96 @@ public static byte[][] splitPreserveAllTokens(byte[] str, int length, char separ * * @param str the String to parse, may be null * @param length amount of bytes to str - * @param separatorChar the ascii separate character - * @param preserveAllTokens if true, adjacent separators are - * treated as empty token separators; if false, adjacent - * separators are treated as one separator. + * @param separator the ascii separate characters * @param target the projection target + * @param numColumns number of columns to be retrieved * @return an array of parsed Strings, null if null String input */ - private static byte[][] splitWorker(byte[] str, int offset, int length, char separatorChar, - boolean preserveAllTokens, int[] target) { - // Performance tuned for 2.0 (JDK1.4) - + private static byte[][] splitWorker(byte[] str, int offset, int length, byte[] separator, int[] target, int numColumns) { if (str == null) { return null; } - int len = length; - if (len == 0) { - return new byte[1][0]; - }else if(len < 0){ - len = str.length - offset; - } - - List list = new ArrayList(); - int i = 0, start = 0; - boolean match = false; - boolean lastMatch = false; - int currentTarget = 0; - int currentIndex = 0; - while (i < len) { - if (str[i + offset] == separatorChar) { - if (match || preserveAllTokens) { - if (target == null) { - byte[] bytes = new byte[i - start]; - System.arraycopy(str, start + offset, bytes, 0, bytes.length); - list.add(bytes); - } else if (target.length > currentTarget && currentIndex == target[currentTarget]) { - byte[] bytes = new byte[i - start]; - System.arraycopy(str, start + offset, bytes, 0, bytes.length); - list.add(bytes); - 
currentTarget++; - } else { - list.add(null); - } - currentIndex++; - match = false; - lastMatch = true; - } - start = ++i; - continue; + if (length == 0) { + return new byte[numColumns][0]; + } + if (length < 0) { + length = str.length - offset; + } + int indexMax = 0; + if (target != null) { + for (int index : target) { + indexMax = Math.max(indexMax, index + 1); } - lastMatch = false; - match = true; - i++; - } - if (match || (preserveAllTokens && lastMatch)) { - if (target == null) { - byte[] bytes = new byte[i - start]; - System.arraycopy(str, start + offset, bytes, 0, bytes.length); - list.add(bytes); - } else if (target.length > currentTarget && currentIndex == target[currentTarget]) { - byte[] bytes = new byte[i - start]; - System.arraycopy(str, start + offset, bytes, 0, bytes.length); - list.add(bytes); //str.substring(start, i)); - currentTarget++; + } else { + indexMax = numColumns; + } + + int[][] indices = split(str, offset, length, separator, new int[indexMax][]); + byte[][] result = new byte[numColumns][]; + + // not-picked -> null, picked but not-exists -> byte[0] + if (target != null) { + for (int i : target) { + int[] index = indices[i]; + result[i] = index == null ? new byte[0] : Arrays.copyOfRange(str, index[0], index[1]); + } + } else { + for (int i = 0; i < result.length; i++) { + int[] index = indices[i]; + result[i] = index == null ? 
new byte[0] : Arrays.copyOfRange(str, index[0], index[1]); + } + } + return result; + } + + public static int[][] split(byte[] str, int offset, int length, byte[] separator, int[][] indices) { + if (indices.length == 0) { + return indices; // trivial + } + final int limit = offset + length; + + int start = offset; + int colIndex = 0; + for (int index = offset; index < limit;) { + if (onDelimiter(str, index, limit, separator)) { + indices[colIndex++] = new int[] {start, index}; + if (colIndex >= indices.length) { + return indices; + } + index += separator.length; + start = index; } else { - list.add(null); + index++; } - currentIndex++; } - return (byte[][]) list.toArray(new byte[list.size()][]); + if (colIndex < indices.length) { + indices[colIndex] = new int[]{start, limit}; + } + return indices; + } + + private static boolean onDelimiter(byte[] input, int offset, int limit, byte[] delimiter) { + for (int i = 0; i < delimiter.length; i++) { + if (offset + i >= limit || input[offset + i] != delimiter[i]) { + return false; + } + } + return true; + } + + public static byte[][] splitTrivial(byte[] value, byte delimiter) { + List split = new ArrayList(); + int prev = 0; + for (int i = 0; i < value.length; i++) { + if (value[i] == delimiter) { + split.add(Arrays.copyOfRange(value, prev, i)); + prev = i + 1; + } + } + if (prev <= value.length) { + split.add(Arrays.copyOfRange(value, prev, value.length)); + } + return split.toArray(new byte[split.size()][]); } /** diff --git a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java index 38c0fd85ee..d035e4a3dd 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/StringUtils.java @@ -186,7 +186,11 @@ public void run() { public static String unicodeEscapedDelimiter(String value) { try { String delimiter = StringEscapeUtils.unescapeJava(value); - return 
unicodeEscapedDelimiter(delimiter.charAt(0)); + StringBuilder builder = new StringBuilder(); + for (char achar : delimiter.toCharArray()) { + builder.append(unicodeEscapedDelimiter(achar)); + } + return builder.toString(); } catch (Throwable e) { } return value; diff --git a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java index 52725867fb..c4329a1fd7 100644 --- a/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java +++ b/tajo-common/src/test/java/org/apache/tajo/util/TestStringUtil.java @@ -103,7 +103,7 @@ public void testSplitBytes() { char separatorChar = '|'; String[] textArray = org.apache.commons.lang.StringUtils.splitPreserveAllTokens(text, separatorChar); - byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar); + byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, 3); assertEquals(textArray.length, bytesArray.length); for (int i = 0; i < textArray.length; i++){ @@ -118,7 +118,7 @@ public void testSplitProjectionBytes() { char separatorChar = '|'; String[] textArray = org.apache.commons.lang.StringUtils.splitPreserveAllTokens(text, separatorChar); - byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, target); + byte[][] bytesArray = BytesUtils.splitPreserveAllTokens(text.getBytes(), separatorChar, target, 3); assertEquals(textArray.length, bytesArray.length); diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java b/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java index 4e4b710f28..876e3e4f1a 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/eval/ExprTestBase.java @@ -238,8 +238,9 @@ public void testEval(OverridableConf context, Schema schema, String tableName, S targetIdx[i] = i; } - lazyTuple = - new 
LazyTuple(inputSchema, BytesUtils.splitPreserveAllTokens(csvTuple.getBytes(), delimiter, targetIdx),0); + byte[][] tokens = BytesUtils.splitPreserveAllTokens( + csvTuple.getBytes(), delimiter, targetIdx, inputSchema.size()); + lazyTuple = new LazyTuple(inputSchema, tokens,0); vtuple = new VTuple(inputSchema.size()); for (int i = 0; i < inputSchema.size(); i++) { diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index 9ba8a5692f..dd93dd14df 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -635,4 +635,28 @@ public void testTimezonedTable5() throws Exception { testingCluster.getConfiguration().setSystemTimezone(TimeZone.getTimeZone("GMT")); } } + + @Test + public void testMultiBytesDelimiter1() throws Exception { + executeDDL("multibytes_delimiter_table1_ddl.sql", "multibytes_delimiter1"); + try { + ResultSet res = executeQuery(); + assertResultSet(res); + cleanupQuery(res); + } finally { + executeString("DROP TABLE table1"); + } + } + + @Test + public void testMultiBytesDelimiter2() throws Exception { + executeDDL("multibytes_delimiter_table2_ddl.sql", "multibytes_delimiter2"); + try { + ResultSet res = executeQuery(); + assertResultSet(res); + cleanupQuery(res); + } finally { + executeString("DROP TABLE table2"); + } + } } \ No newline at end of file diff --git a/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl new file mode 100644 index 0000000000..5acccf6582 --- /dev/null +++ b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter1/table1.tbl @@ -0,0 +1,5 @@ +1||ooo||1.1||a +2||ppp||2.3|| +3||qqq|||| +4||||4.5|| +||xxx||5.6||e \ No newline at end of file diff --git 
a/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl new file mode 100644 index 0000000000..b26cdfd08f --- /dev/null +++ b/tajo-core/src/test/resources/dataset/TestSelectQuery/multibytes_delimiter2/table2.tbl @@ -0,0 +1,5 @@ +1ㅎoooㅎ1.1ㅎa +2ㅎpppㅎ2.3ㅎ +3ㅎqqqㅎㅎ +4ㅎㅎ4.5ㅎ +ㅎxxxㅎ5.6ㅎe \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql new file mode 100644 index 0000000000..2b4a2ce29c --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table1_ddl.sql @@ -0,0 +1,3 @@ +create external table table1 (id int, name text, score float, type text) using csv +with ('csvfile.delimiter'='||', 'csvfile.null'='NULL') location ${table.path}; + diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql new file mode 100644 index 0000000000..d918ac6aac --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table2_ddl.sql @@ -0,0 +1,3 @@ +create external table table2 (id int, name text, score float, type text) using csv +with ('csvfile.delimiter'='ㅎ', 'csvfile.null'='NULL') location ${table.path}; + diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql new file mode 100644 index 0000000000..bd6b02daaf --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter1.sql @@ -0,0 +1 @@ +select * from table1; \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql 
b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql new file mode 100644 index 0000000000..66a69ec071 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter2.sql @@ -0,0 +1 @@ +select * from table2; \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result new file mode 100644 index 0000000000..d8d43b1cd7 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter1.result @@ -0,0 +1,7 @@ +id,name,score,type +------------------------------- +1,ooo,1.1,a +2,ppp,2.3, +3,qqq,null, +4,,4.5, +null,xxx,5.6,e \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result new file mode 100644 index 0000000000..d8d43b1cd7 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter2.result @@ -0,0 +1,7 @@ +id,name,score,type +------------------------------- +1,ooo,1.1,a +2,ppp,2.3, +3,qqq,null, +4,,4.5, +null,xxx,5.6,e \ No newline at end of file diff --git a/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java b/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java index c6149f72df..fccaf2a109 100644 --- a/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java +++ b/tajo-storage/tajo-storage-common/src/test/java/org/apache/tajo/storage/TestLazyTuple.java @@ -69,7 +69,7 @@ public void setUp() { sb.append(DatumFactory.createInet4("192.168.0.1")).append('|'); sb.append(new String(nullbytes)).append('|'); sb.append(NullDatum.get()); - textRow = BytesUtils.splitPreserveAllTokens(sb.toString().getBytes(), '|'); + textRow = 
BytesUtils.splitPreserveAllTokens(sb.toString().getBytes(), '|', 13); serde = new TextSerializerDeserializer(); } @@ -220,7 +220,7 @@ public void testPutTuple() { @Test public void testInvalidNumber() { - byte[][] bytes = BytesUtils.splitPreserveAllTokens(" 1| |2 ||".getBytes(), '|'); + byte[][] bytes = BytesUtils.splitPreserveAllTokens(" 1| |2 ||".getBytes(), '|', 5); Schema schema = new Schema(); schema.addColumn("col1", TajoDataTypes.Type.INT2); schema.addColumn("col2", TajoDataTypes.Type.INT4); diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java index 7ddf09a204..c3094fd21c 100644 --- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java +++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/ColumnMapping.java @@ -85,7 +85,7 @@ public void init() throws IOException { for (String eachToken: columnMappingTokens) { mappingColumns[index] = new byte[2][]; - byte[][] mappingTokens = BytesUtils.splitPreserveAllTokens(eachToken.trim().getBytes(), ':'); + byte[][] mappingTokens = BytesUtils.splitTrivial(eachToken.trim().getBytes(), (byte)':'); if (mappingTokens.length == 3) { if (mappingTokens[0].length == 0) { @@ -230,6 +230,10 @@ public int getNumRowKeys() { return numRowKeys; } + public int getNumColumns() { + return schema.size(); + } + public boolean[] getIsColumnValues() { return isColumnValues; } diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java index 5cae077431..ab5625220c 100644 --- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java +++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseScanner.java @@ -218,7 +218,8 
@@ private Datum getDatum(Result result, int fieldId) throws IOException { if (!isBinaryColumns[fieldId] && rowKeyFieldIndexes[fieldId] >= 0) { int rowKeyFieldIndex = rowKeyFieldIndexes[fieldId]; - byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens(value, rowKeyDelimiter); + byte[][] rowKeyFields = BytesUtils.splitPreserveAllTokens( + value, rowKeyDelimiter, columnMapping.getNumColumns()); if (rowKeyFields.length < rowKeyFieldIndex) { return NullDatum.get(); diff --git a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java index 2a635d8cd1..a9e5bdee55 100644 --- a/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java +++ b/tajo-storage/tajo-storage-hbase/src/main/java/org/apache/tajo/storage/hbase/HBaseStorageManager.java @@ -1015,7 +1015,8 @@ public TupleRange[] getInsertSortRanges(OverridableConf queryContext, TableDesc Tuple endTuple = new VTuple(sortSpecs.length); byte[][] rowKeyFields; if (sortSpecs.length > 1) { - byte[][] splitValues = BytesUtils.splitPreserveAllTokens(eachEndKey, columnMapping.getRowKeyDelimiter()); + byte[][] splitValues = BytesUtils.splitPreserveAllTokens( + eachEndKey, columnMapping.getRowKeyDelimiter(), columnMapping.getNumColumns()); if (splitValues.length == sortSpecs.length) { rowKeyFields = splitValues; } else { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java index dd5366c045..bb628b1496 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java @@ -61,7 +61,7 @@ public static class CSVAppender extends FileAppender { private FSDataOutputStream fos; private DataOutputStream 
outputStream; private CompressionOutputStream deflateFilter; - private char delimiter; + private byte[] delimiter; private TableStatistics stats = null; private Compressor compressor; private CompressionCodecFactory codecFactory; @@ -83,7 +83,7 @@ public CSVAppender(Configuration conf, final TaskAttemptId taskAttemptId, this.meta = meta; this.schema = schema; this.delimiter = StringEscapeUtils.unescapeJava( - this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).charAt(0); + this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).getBytes(); this.columnNum = schema.size(); @@ -169,8 +169,8 @@ public void addTuple(Tuple tuple) throws IOException { rowBytes += serde.serialize(schema.getColumn(i), datum, os, nullChars); if(columnNum - 1 > i){ - os.write((byte) delimiter); - rowBytes += 1; + os.write(delimiter); + rowBytes += delimiter.length; } if (isShuffle) { // it is to calculate min/max values, and it is only used for the intermediate file. 
@@ -265,7 +265,7 @@ public CSVScanner(Configuration conf, final Schema schema, final TableMeta meta, //Delimiter this.delimiter = StringEscapeUtils.unescapeJava( meta.getOption(StorageConstants.TEXT_DELIMITER, - meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).charAt(0); + meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).getBytes(); String nullCharacters = StringEscapeUtils.unescapeJava( meta.getOption(StorageConstants.TEXT_NULL, @@ -279,7 +279,7 @@ public CSVScanner(Configuration conf, final Schema schema, final TableMeta meta, } private final static int DEFAULT_PAGE_SIZE = 256 * 1024; - private char delimiter; + private byte[] delimiter; private FileSystem fs; private FSDataInputStream fis; private InputStream is; //decompressd stream @@ -476,7 +476,7 @@ public Tuple next() throws IOException { } byte[][] cells = BytesUtils.splitPreserveAllTokens(buffer.getData(), startOffsets.get(currentIdx), - rowLengthList.get(currentIdx), delimiter, targetColumnIndexes); + rowLengthList.get(currentIdx), delimiter, targetColumnIndexes, schema.size()); currentIdx++; return new LazyTuple(schema, cells, offset, nullChars, serde); } catch (Throwable t) { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java index 74563fff10..92a041cfc7 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/sequencefile/SequenceFileScanner.java @@ -171,7 +171,8 @@ public Tuple next() throws IOException { } else { Text text = new Text(); reader.getCurrentValue(text); - cells = BytesUtils.splitPreserveAllTokens(text.getBytes(), delimiter, projectionMap); + cells = 
BytesUtils.splitPreserveAllTokens(text.getBytes(), + delimiter, projectionMap, schema.getColumns().size()); totalBytes += (long)text.getBytes().length; tuple = new LazyTuple(schema, cells, 0, nullChars, serde); } From b0ab7eafd35a51bfeddb3812bff8718446029f56 Mon Sep 17 00:00:00 2001 From: "navis.ryu" Date: Thu, 30 Apr 2015 19:17:45 +0900 Subject: [PATCH 027/141] TAJO-1381: Support multi-bytes delimiter for Text file. Signed-off-by: Jinho Kim --- CHANGES | 3 ++ .../tajo/engine/query/TestSelectQuery.java | 24 ++++++++++ .../multibytes_delimiter_table3_ddl.sql | 3 ++ .../multibytes_delimiter_table4_ddl.sql | 3 ++ .../testMultiBytesDelimiter3.sql | 1 + .../testMultiBytesDelimiter4.sql | 1 + .../testMultiBytesDelimiter3.result | 7 +++ .../testMultiBytesDelimiter4.result | 7 +++ .../java/org/apache/tajo/storage/CSVFile.java | 11 +++-- .../storage/text/CSVLineDeserializer.java | 14 ++++-- .../tajo/storage/text/CSVLineSerDe.java | 5 ++- .../tajo/storage/text/CSVLineSerializer.java | 8 ++-- .../tajo/storage/text/DelimitedTextFile.java | 2 +- .../storage/text/FieldSplitProcessor.java | 8 +--- .../text/MultiBytesFieldSplitProcessor.java | 45 +++++++++++++++++++ .../storage/text/TextLineDeserializer.java | 6 +-- .../tajo/storage/text/TextLineSerDe.java | 3 +- .../tajo/storage/TestSplitProcessor.java | 38 ++++++++++++++-- 18 files changed, 162 insertions(+), 27 deletions(-) create mode 100644 tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table3_ddl.sql create mode 100644 tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table4_ddl.sql create mode 100644 tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter3.sql create mode 100644 tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter4.sql create mode 100644 tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter3.result create mode 100644 
tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter4.result create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/MultiBytesFieldSplitProcessor.java diff --git a/CHANGES b/CHANGES index e19eaac219..cc54cc19d8 100644 --- a/CHANGES +++ b/CHANGES @@ -8,6 +8,9 @@ Release 0.10.1 - unreleased (jihun) IMPROVEMENT + + TAJO-1381: Support multi-bytes delimiter for Text file. + (Contributed by navis, Committed by jinho) TAJO-1374: Support multi-bytes delimiter for CSV file. (Contributed by navis, Committed by jinho) diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index dd93dd14df..f7b1382db7 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -659,4 +659,28 @@ public void testMultiBytesDelimiter2() throws Exception { executeString("DROP TABLE table2"); } } + + @Test + public void testMultiBytesDelimiter3() throws Exception { + executeDDL("multibytes_delimiter_table3_ddl.sql", "multibytes_delimiter1"); + try { + ResultSet res = executeQuery(); + assertResultSet(res); + cleanupQuery(res); + } finally { + executeString("DROP TABLE table1"); + } + } + + @Test + public void testMultiBytesDelimiter4() throws Exception { + executeDDL("multibytes_delimiter_table4_ddl.sql", "multibytes_delimiter2"); + try { + ResultSet res = executeQuery(); + assertResultSet(res); + cleanupQuery(res); + } finally { + executeString("DROP TABLE table2"); + } + } } \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table3_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table3_ddl.sql new file mode 100644 index 0000000000..8309d119ef --- /dev/null +++ 
b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table3_ddl.sql @@ -0,0 +1,3 @@ +create external table table1 (id int, name text, score float, type text) using text +with ('text.delimiter'='||', 'text.null'='NULL') location ${table.path}; + diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table4_ddl.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table4_ddl.sql new file mode 100644 index 0000000000..2fb821aba5 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/multibytes_delimiter_table4_ddl.sql @@ -0,0 +1,3 @@ +create external table table2 (id int, name text, score float, type text) using text +with ('text.delimiter'='ㅎ', 'text.null'='NULL') location ${table.path}; + diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter3.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter3.sql new file mode 100644 index 0000000000..bd6b02daaf --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter3.sql @@ -0,0 +1 @@ +select * from table1; \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter4.sql b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter4.sql new file mode 100644 index 0000000000..66a69ec071 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestSelectQuery/testMultiBytesDelimiter4.sql @@ -0,0 +1 @@ +select * from table2; \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter3.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter3.result new file mode 100644 index 0000000000..d8d43b1cd7 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter3.result @@ -0,0 +1,7 @@ +id,name,score,type +------------------------------- 
+1,ooo,1.1,a +2,ppp,2.3, +3,qqq,null, +4,,4.5, +null,xxx,5.6,e \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter4.result b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter4.result new file mode 100644 index 0000000000..d8d43b1cd7 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestSelectQuery/testMultiBytesDelimiter4.result @@ -0,0 +1,7 @@ +id,name,score,type +------------------------------- +1,ooo,1.1,a +2,ppp,2.3, +3,qqq,null, +4,,4.5, +null,xxx,5.6,e \ No newline at end of file diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java index bb628b1496..c1047d980a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/CSVFile.java @@ -40,6 +40,7 @@ import org.apache.tajo.storage.exception.AlreadyExistsStorageException; import org.apache.tajo.storage.fragment.Fragment; import org.apache.tajo.storage.rcfile.NonSyncByteArrayOutputStream; +import org.apache.tajo.util.Bytes; import org.apache.tajo.util.BytesUtils; import java.io.*; @@ -83,7 +84,8 @@ public CSVAppender(Configuration conf, final TaskAttemptId taskAttemptId, this.meta = meta; this.schema = schema; this.delimiter = StringEscapeUtils.unescapeJava( - this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)).getBytes(); + this.meta.getOption(StorageConstants.TEXT_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER)) + .getBytes(Bytes.UTF8_CHARSET); this.columnNum = schema.size(); @@ -93,7 +95,7 @@ public CSVAppender(Configuration conf, final TaskAttemptId taskAttemptId, if (StringUtils.isEmpty(nullCharacters)) { nullChars = NullDatum.get().asTextBytes(); } else { - nullChars = nullCharacters.getBytes(); + nullChars = 
nullCharacters.getBytes(Bytes.UTF8_CHARSET); } } @@ -265,7 +267,8 @@ public CSVScanner(Configuration conf, final Schema schema, final TableMeta meta, //Delimiter this.delimiter = StringEscapeUtils.unescapeJava( meta.getOption(StorageConstants.TEXT_DELIMITER, - meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))).getBytes(); + meta.getOption(StorageConstants.CSVFILE_DELIMITER, StorageConstants.DEFAULT_FIELD_DELIMITER))) + .getBytes(Bytes.UTF8_CHARSET); String nullCharacters = StringEscapeUtils.unescapeJava( meta.getOption(StorageConstants.TEXT_NULL, @@ -274,7 +277,7 @@ public CSVScanner(Configuration conf, final Schema schema, final TableMeta meta, if (StringUtils.isEmpty(nullCharacters)) { nullChars = NullDatum.get().asTextBytes(); } else { - nullChars = nullCharacters.getBytes(); + nullChars = nullCharacters.getBytes(Bytes.UTF8_CHARSET); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java index 1599f62ad4..6a8c7a9239 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java @@ -19,6 +19,7 @@ package org.apache.tajo.storage.text; import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufProcessor; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.datum.Datum; @@ -28,9 +29,10 @@ import java.io.IOException; public class CSVLineDeserializer extends TextLineDeserializer { - private FieldSplitProcessor processor; + private ByteBufProcessor processor; private FieldSerializerDeserializer fieldSerDer; private ByteBuf nullChars; + private int delimiterCompensation; public CSVLineDeserializer(Schema schema, TableMeta meta, int[] targetColumnIndexes) { 
super(schema, meta, targetColumnIndexes); @@ -38,7 +40,13 @@ public CSVLineDeserializer(Schema schema, TableMeta meta, int[] targetColumnInde @Override public void init() { - this.processor = new FieldSplitProcessor(CSVLineSerDe.getFieldDelimiter(meta)); + byte[] delimiter = CSVLineSerDe.getFieldDelimiter(meta); + if (delimiter.length == 1) { + this.processor = new FieldSplitProcessor(delimiter[0]); + } else { + this.processor = new MultiBytesFieldSplitProcessor(delimiter); + } + this.delimiterCompensation = delimiter.length - 1; if (nullChars != null) { nullChars.release(); @@ -67,7 +75,7 @@ public void deserialize(final ByteBuf lineBuf, Tuple output) throws IOException, if (end < 0) { fieldLength = rowLength - start; } else { - fieldLength = end - start; + fieldLength = end - start - delimiterCompensation; } if (projection.length > currentTarget && currentIndex == projection[currentTarget]) { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerDe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerDe.java index 2fe7f239fe..988d5d1457 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerDe.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerDe.java @@ -22,6 +22,7 @@ import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.storage.StorageConstants; +import org.apache.tajo.util.Bytes; public class CSVLineSerDe extends TextLineSerDe { @Override @@ -34,8 +35,8 @@ public TextLineSerializer createSerializer(Schema schema, TableMeta meta) { return new CSVLineSerializer(schema, meta); } - public static char getFieldDelimiter(TableMeta meta) { + public static byte[] getFieldDelimiter(TableMeta meta) { return StringEscapeUtils.unescapeJava(meta.getOption(StorageConstants.TEXT_DELIMITER, - StorageConstants.DEFAULT_FIELD_DELIMITER)).charAt(0); + 
StorageConstants.DEFAULT_FIELD_DELIMITER)).getBytes(Bytes.UTF8_CHARSET); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerializer.java index 53a0ef3b37..9a2fe3745b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineSerializer.java @@ -30,8 +30,8 @@ public class CSVLineSerializer extends TextLineSerializer { private FieldSerializerDeserializer serde; - private byte [] nullChars; - private char delimiter; + private byte[] nullChars; + private byte[] delimiter; private int columnNum; public CSVLineSerializer(Schema schema, TableMeta meta) { @@ -56,8 +56,8 @@ public int serialize(OutputStream out, Tuple input) throws IOException { writtenBytes += serde.serialize(out, datum, schema.getColumn(i), i, nullChars); if (columnNum - 1 > i) { - out.write((byte) delimiter); - writtenBytes += 1; + out.write(delimiter); + writtenBytes += delimiter.length; } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedTextFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedTextFile.java index ebf9608e1d..4c9234e7af 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedTextFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/DelimitedTextFile.java @@ -391,7 +391,7 @@ public Tuple next() throws IOException { try { deserializer.deserialize(buf, tuple); - // if a line is read normaly, it exists this loop. + // if a line is read normally, it exits this loop.
break; } catch (TextLineParsingError tae) { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/FieldSplitProcessor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/FieldSplitProcessor.java index a5ac142f67..862b5ae155 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/FieldSplitProcessor.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/FieldSplitProcessor.java @@ -21,9 +21,9 @@ import io.netty.buffer.ByteBufProcessor; public class FieldSplitProcessor implements ByteBufProcessor { - private char delimiter; //the ascii separate character + private byte delimiter; //the ascii separator character - public FieldSplitProcessor(char recordDelimiterByte) { + public FieldSplitProcessor(byte recordDelimiterByte) { this.delimiter = recordDelimiterByte; } @@ -31,8 +31,4 @@ public FieldSplitProcessor(char recordDelimiterByte) { public boolean process(byte value) throws Exception { return delimiter != value; } - - public char getDelimiter() { - return delimiter; - } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/MultiBytesFieldSplitProcessor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/MultiBytesFieldSplitProcessor.java new file mode 100644 index 0000000000..b97d7c6259 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/MultiBytesFieldSplitProcessor.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.text; + +import io.netty.buffer.ByteBufProcessor; + +public class MultiBytesFieldSplitProcessor implements ByteBufProcessor { + + private int index; + private final byte[] delimiter; + + public MultiBytesFieldSplitProcessor(byte[] recordDelimiterByte) { + this.delimiter = recordDelimiterByte; + } + + @Override + public boolean process(byte value) throws Exception { + if (delimiter[index] != value) { + index = 0; + return true; + } + if (index != delimiter.length - 1) { + index++; + return true; + } + index = 0; + return false; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineDeserializer.java index 7ebfa79ce1..89a7de9032 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineDeserializer.java @@ -29,9 +29,9 @@ * Reads a text line and fills a Tuple with values */ public abstract class TextLineDeserializer { - protected Schema schema; - protected TableMeta meta; - protected int [] targetColumnIndexes; + protected final Schema schema; + protected final TableMeta meta; + protected final int[] targetColumnIndexes; public TextLineDeserializer(Schema schema, TableMeta meta, int [] targetColumnIndexes) { this.schema = schema; diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerDe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerDe.java index e81e289e76..1a53bb0a7a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerDe.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextLineSerDe.java @@ -26,6 +26,7 @@ import org.apache.tajo.datum.NullDatum; import org.apache.tajo.storage.BufferPool; import org.apache.tajo.storage.StorageConstants; +import org.apache.tajo.util.Bytes; /** * Pluggable Text Line SerDe class @@ -56,7 +57,7 @@ public static ByteBuf getNullChars(TableMeta meta) { if (StringUtils.isEmpty(nullCharacters)) { nullChars = NullDatum.get().asTextBytes(); } else { - nullChars = nullCharacters.getBytes(); + nullChars = nullCharacters.getBytes(Bytes.UTF8_CHARSET); } return nullChars; diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestSplitProcessor.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestSplitProcessor.java index 12ea5510f2..2174d62396 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestSplitProcessor.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestSplitProcessor.java @@ -19,10 +19,12 @@ package org.apache.tajo.storage; import io.netty.buffer.ByteBuf; +import io.netty.buffer.ByteBufProcessor; import io.netty.buffer.Unpooled; import io.netty.util.CharsetUtil; import org.apache.tajo.storage.text.FieldSplitProcessor; import org.apache.tajo.storage.text.LineSplitProcessor; +import org.apache.tajo.storage.text.MultiBytesFieldSplitProcessor; import org.junit.Test; import java.io.IOException; @@ -35,17 +37,47 @@ public class TestSplitProcessor { @Test public void testFieldSplitProcessor() throws IOException { - String data = "abc||de"; + String data = "abc||de|"; final ByteBuf buf = 
releaseLater( Unpooled.copiedBuffer(data, CharsetUtil.ISO_8859_1)); final int len = buf.readableBytes(); - FieldSplitProcessor processor = new FieldSplitProcessor('|'); + FieldSplitProcessor processor = new FieldSplitProcessor((byte)'|'); assertEquals(3, buf.forEachByte(0, len, processor)); assertEquals(4, buf.forEachByte(4, len - 4, processor)); - assertEquals(-1, buf.forEachByte(5, len - 5, processor)); + assertEquals(7, buf.forEachByte(5, len - 5, processor)); + assertEquals(-1, buf.forEachByte(8, len - 8, processor)); + } + + @Test + public void testMultiCharFieldSplitProcessor1() throws IOException { + String data = "abc||||de||"; + final ByteBuf buf = releaseLater( + Unpooled.copiedBuffer(data, CharsetUtil.ISO_8859_1)); + + final int len = buf.readableBytes(); + ByteBufProcessor processor = new MultiBytesFieldSplitProcessor("||".getBytes()); + + assertEquals(4, buf.forEachByte(0, len, processor)); + assertEquals(6, buf.forEachByte(5, len - 5, processor)); + assertEquals(10, buf.forEachByte(7, len - 7, processor)); + assertEquals(-1, buf.forEachByte(11, len - 11, processor)); + } + + @Test + public void testMultiCharFieldSplitProcessor2() throws IOException { + String data = "abcㅎㅎdeㅎ"; + final ByteBuf buf = releaseLater( + Unpooled.copiedBuffer(data, CharsetUtil.UTF_8)); + + final int len = buf.readableBytes(); + ByteBufProcessor processor = new MultiBytesFieldSplitProcessor("ㅎ".getBytes()); + assertEquals(5, buf.forEachByte(0, len, processor)); + assertEquals(8, buf.forEachByte(6, len - 6, processor)); + assertEquals(13, buf.forEachByte(9, len - 9, processor)); + assertEquals(-1, buf.forEachByte(14, len - 14, processor)); } @Test From f9a531c0ff9ac4fadb5562eeb6b0b48da44cf547 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Wed, 6 May 2015 12:09:39 +0900 Subject: [PATCH 028/141] TAJO-1534: DelimitedTextFile return null instead of a NullDatum. 
(jinho) --- CHANGES | 2 + .../apache/tajo/storage/rcfile/RCFile.java | 14 ++--- .../storage/text/CSVLineDeserializer.java | 18 +++++- .../org/apache/tajo/storage/TestStorages.java | 59 +++++++++++++++++++ 4 files changed, 83 insertions(+), 10 deletions(-) diff --git a/CHANGES b/CHANGES index cc54cc19d8..a184d6cdf9 100644 --- a/CHANGES +++ b/CHANGES @@ -33,6 +33,8 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1534: DelimitedTextFile return null instead of a NullDatum. (jinho) + TAJO-1574: Fix NPE on natural join. (Contributed by Dongjoon Hyun, Committed by jihoon) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rcfile/RCFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rcfile/RCFile.java index 44aabd453e..62e5ed9dca 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rcfile/RCFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/rcfile/RCFile.java @@ -1255,17 +1255,18 @@ public void init() throws IOException { for (int i = 0; i < targetColumnIndexes.length; i++) { int tid = targetColumnIndexes[i]; + SelectedColumn col = new SelectedColumn(); + col.colIndex = tid; if (tid < columnNumber) { skippedColIDs[tid] = false; - - SelectedColumn col = new SelectedColumn(); - col.colIndex = tid; col.runLength = 0; col.prvLength = -1; col.rowReadIndex = 0; - selectedColumns[i] = col; colValLenBufferReadIn[i] = new NonSyncDataInputBuffer(); + } else { + col.isNulled = true; } + selectedColumns[i] = col; } currentKey = createKeyBuffer(); @@ -1583,10 +1584,7 @@ protected int nextKeyBuffer() throws IOException { for (int selIx = 0; selIx < selectedColumns.length; selIx++) { SelectedColumn col = selectedColumns[selIx]; - if (col == null) { - col = new SelectedColumn(); - col.isNulled = true; - selectedColumns[selIx] = col; + if (col.isNulled) { continue; } diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java index 6a8c7a9239..03a0a261c6 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/CSVLineDeserializer.java @@ -23,6 +23,7 @@ import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.datum.Datum; +import org.apache.tajo.datum.NullDatum; import org.apache.tajo.storage.FieldSerializerDeserializer; import org.apache.tajo.storage.Tuple; @@ -80,8 +81,14 @@ public void deserialize(final ByteBuf lineBuf, Tuple output) throws IOException, if (projection.length > currentTarget && currentIndex == projection[currentTarget]) { lineBuf.setIndex(start, start + fieldLength); - Datum datum = fieldSerDer.deserialize(lineBuf, schema.getColumn(currentIndex), currentIndex, nullChars); - output.put(currentIndex, datum); + + try { + Datum datum = fieldSerDer.deserialize(lineBuf, schema.getColumn(currentIndex), currentIndex, nullChars); + output.put(currentIndex, datum); + } catch (Exception e) { + output.put(currentIndex, NullDatum.get()); + } + currentTarget++; } @@ -92,6 +99,13 @@ public void deserialize(final ByteBuf lineBuf, Tuple output) throws IOException, start = end + 1; currentIndex++; } + + /* If a text row is less than table schema size, tuple should set to NullDatum */ + if (projection.length > currentTarget) { + for (; currentTarget < projection.length; currentTarget++) { + output.put(projection[currentTarget], NullDatum.get()); + } + } } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java index 9577e3da96..b73fb5baf8 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java @@ -952,4 +952,63 @@ int record = 4 + 8 + 2 + 5 + 8; // required size is 27 StorageManager.clearCache(); } } + + @Test + public void testLessThanSchemaSize() throws IOException { + /* RAW is internal storage. It must be same with schema size */ + if (storeType == StoreType.RAW || storeType == StoreType.AVRO){ + return; + } + + Schema dataSchema = new Schema(); + dataSchema.addColumn("col1", Type.FLOAT4); + dataSchema.addColumn("col2", Type.FLOAT8); + dataSchema.addColumn("col3", Type.INT2); + + KeyValueSet options = new KeyValueSet(); + TableMeta meta = CatalogUtil.newTableMeta(storeType, options); + meta.setOptions(CatalogUtil.newPhysicalProperties(storeType)); + + Path tablePath = new Path(testDir, "testLessThanSchemaSize.data"); + FileStorageManager sm = (FileStorageManager) StorageManager.getFileStorageManager(conf); + Appender appender = sm.getAppender(meta, dataSchema, tablePath); + appender.init(); + + + Tuple expect = new VTuple(dataSchema.size()); + expect.put(new Datum[]{ + DatumFactory.createFloat4(Float.MAX_VALUE), + DatumFactory.createFloat8(Double.MAX_VALUE), + DatumFactory.createInt2(Short.MAX_VALUE) + }); + + appender.addTuple(expect); + appender.flush(); + appender.close(); + + assertTrue(fs.exists(tablePath)); + FileStatus status = fs.getFileStatus(tablePath); + Schema inSchema = new Schema(); + inSchema.addColumn("col1", Type.FLOAT4); + inSchema.addColumn("col2", Type.FLOAT8); + inSchema.addColumn("col3", Type.INT2); + inSchema.addColumn("col4", Type.INT4); + inSchema.addColumn("col5", Type.INT8); + + FileFragment fragment = new FileFragment("table", tablePath, 0, status.getLen()); + Scanner scanner = StorageManager.getFileStorageManager(conf).getScanner(meta, inSchema, fragment); + + Schema target = new Schema(); + + target.addColumn("col2", Type.FLOAT8); + 
target.addColumn("col5", Type.INT8); + scanner.setTarget(target.toArray()); + scanner.init(); + + Tuple tuple = scanner.next(); + scanner.close(); + + assertEquals(expect.get(1), tuple.get(1)); + assertEquals(NullDatum.get(), tuple.get(4)); + } } From 9d331f9af11d77f16d332029ad7debe932689cf2 Mon Sep 17 00:00:00 2001 From: Yongjin Choi Date: Wed, 6 May 2015 18:47:38 +0900 Subject: [PATCH 029/141] TAJO-1556: "insert into select" with reordered column list does not work. Signed-off-by: Jihoon Son --- CHANGES | 3 +++ .../java/org/apache/tajo/catalog/Schema.java | 12 ++++++--- .../engine/planner/TestLogicalPlanner.java | 23 ++++++++++++++-- .../tajo/engine/query/TestInsertQuery.java | 19 +++++++++++++ .../TestInsertQuery/nation_diff_col_order.ddl | 1 + .../testInsertWithDifferentColumnOrder.sql | 1 + .../testInsertWithDifferentColumnOrder.result | 27 +++++++++++++++++++ .../org/apache/tajo/plan/LogicalPlanner.java | 9 ++++--- 8 files changed, 86 insertions(+), 9 deletions(-) create mode 100644 tajo-core/src/test/resources/queries/TestInsertQuery/nation_diff_col_order.ddl create mode 100644 tajo-core/src/test/resources/queries/TestInsertQuery/testInsertWithDifferentColumnOrder.sql create mode 100644 tajo-core/src/test/resources/results/TestInsertQuery/testInsertWithDifferentColumnOrder.result diff --git a/CHANGES b/CHANGES index a184d6cdf9..de1144e0c9 100644 --- a/CHANGES +++ b/CHANGES @@ -33,6 +33,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1556: "insert into select" with reordered column list does not work. + (Contributed by Yongjin Choi, Committed by jihoon) + TAJO-1534: DelimitedTextFile return null instead of a NullDatum. (jinho) TAJO-1574: Fix NPE on natural join. 
diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/Schema.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/Schema.java index 71c1b01dc2..078f8a9f81 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/Schema.java +++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/Schema.java @@ -121,13 +121,19 @@ public Column getColumn(int id) { } public Column getColumn(Column column) { + int idx = getIndex(column); + return idx >= 0 ? fields.get(idx) : null; + } + + public int getIndex(Column column) { if (!contains(column)) { - return null; + return -1; } + if (column.hasQualifier()) { - return fields.get(fieldsByQualifiedName.get(column.getQualifiedName())); + return fieldsByQualifiedName.get(column.getQualifiedName()); } else { - return fields.get(fieldsByName.get(column.getSimpleName()).get(0)); + return fieldsByName.get(column.getSimpleName()).get(0); } } diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestLogicalPlanner.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestLogicalPlanner.java index 0b59bc7a35..af0aa6aebc 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestLogicalPlanner.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestLogicalPlanner.java @@ -1106,7 +1106,7 @@ public void testJsonSerDerObject(LogicalNode rootNode) { // Table descriptions // // employee (name text, empid int4, deptname text) - // dept (deptname text, nameger text) + // dept (deptname text, manager text) // score (deptname text, score inet4) static final String [] insertStatements = { @@ -1115,7 +1115,8 @@ public void testJsonSerDerObject(LogicalNode rootNode) { "insert into employee (name, deptname) select * from dept", // 2 "insert into location '/tmp/data' select name, empid from employee", // 3 "insert overwrite into employee (name, deptname) select * from dept", // 4 - "insert overwrite into 
LOCATION '/tmp/data' select * from dept" // 5 + "insert overwrite into LOCATION '/tmp/data' select * from dept", // 5 + "insert into employee (deptname, name) select deptname, manager from dept" // 6 }; @Test @@ -1198,6 +1199,24 @@ public final void testInsertInto5() throws PlanningException { assertTrue(insertNode.hasPath()); } + @Test + public final void testInsertInto6() throws PlanningException { + QueryContext qc = new QueryContext(util.getConfiguration(), session); + + Expr expr = sqlAnalyzer.parse(insertStatements[6]); + LogicalPlan plan = planner.createPlan(qc, expr); + assertEquals(1, plan.getQueryBlocks().size()); + InsertNode insertNode = getInsertNode(plan); + + ProjectionNode subquery = insertNode.getChild(); + Target[] targets = subquery.getTargets(); + // targets MUST be manager, NULL as empid, deptname + assertEquals(targets[0].getNamedColumn().getQualifiedName(), "default.dept.manager"); + assertEquals(targets[1].getAlias(), "empid"); + assertEquals(targets[1].getEvalTree().getType(), EvalType.CONST); + assertEquals(targets[2].getNamedColumn().getQualifiedName(), "default.dept.deptname"); + } + private static InsertNode getInsertNode(LogicalPlan plan) { LogicalRootNode root = plan.getRootBlock().getRoot(); assertEquals(NodeType.INSERT, root.getChild().getType()); diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestInsertQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestInsertQuery.java index 0799d22816..72cbf871b6 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestInsertQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestInsertQuery.java @@ -817,4 +817,23 @@ public final void testInsertOverwriteLocationWithUnionDifferenceAlias() throws E assertNotNull(resultDatas); assertEquals(expected, resultDatas); } + + @Test + public final void testInsertWithDifferentColumnOrder() throws Exception { + ResultSet res = executeFile("nation_diff_col_order.ddl"); + res.close(); + + 
CatalogService catalog = testingCluster.getMaster().getCatalog(); + assertTrue(catalog.existsTable(getCurrentDatabase(), "nation_diff")); + + try { + res = executeFile("testInsertWithDifferentColumnOrder.sql"); + res.close(); + + res = executeString("select * from nation_diff"); + assertResultSet(res); + } finally { + executeString("drop table nation_diff purge;"); + } + } } diff --git a/tajo-core/src/test/resources/queries/TestInsertQuery/nation_diff_col_order.ddl b/tajo-core/src/test/resources/queries/TestInsertQuery/nation_diff_col_order.ddl new file mode 100644 index 0000000000..6998304d5d --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestInsertQuery/nation_diff_col_order.ddl @@ -0,0 +1 @@ +create table nation_diff (n_nationkey int8, n_name text, n_regionkey int8, n_comment text); \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertWithDifferentColumnOrder.sql b/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertWithDifferentColumnOrder.sql new file mode 100644 index 0000000000..ad360f9504 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertWithDifferentColumnOrder.sql @@ -0,0 +1 @@ +insert overwrite into nation_diff (n_comment, n_name) select n_comment, n_name from default.nation; \ No newline at end of file diff --git a/tajo-core/src/test/resources/results/TestInsertQuery/testInsertWithDifferentColumnOrder.result b/tajo-core/src/test/resources/results/TestInsertQuery/testInsertWithDifferentColumnOrder.result new file mode 100644 index 0000000000..4cd3b810e5 --- /dev/null +++ b/tajo-core/src/test/resources/results/TestInsertQuery/testInsertWithDifferentColumnOrder.result @@ -0,0 +1,27 @@ +n_nationkey,n_name,n_regionkey,n_comment +------------------------------- +null,ALGERIA,null, haggle. carefully final deposits detect slyly agai +null,ARGENTINA,null,al foxes promise slyly according to the regular accounts. 
bold requests alon +null,BRAZIL,null,y alongside of the pending deposits. carefully special packages are about the ironic forges. slyly special +null,CANADA,null,eas hang ironic, silent packages. slyly regular packages are furiously over the tithes. fluffily bold +null,EGYPT,null,y above the carefully unusual theodolites. final dugouts are quickly across the furiously regular d +null,ETHIOPIA,null,ven packages wake quickly. regu +null,FRANCE,null,refully final requests. regular, ironi +null,GERMANY,null,l platelets. regular accounts x-ray: unusual, regular acco +null,INDIA,null,ss excuses cajole slyly across the packages. deposits print aroun +null,INDONESIA,null, slyly express asymptotes. regular deposits haggle slyly. carefully ironic hockey players sleep blithely. carefull +null,IRAN,null,efully alongside of the slyly final dependencies. +null,IRAQ,null,nic deposits boost atop the quickly final requests? quickly regula +null,JAPAN,null,ously. final, express gifts cajole a +null,JORDAN,null,ic deposits are blithely about the carefully regular pa +null,KENYA,null, pending excuses haggle furiously deposits. pending, express pinto beans wake fluffily past t +null,MOROCCO,null,rns. blithely bold courts among the closely regular packages use furiously bold platelets? +null,MOZAMBIQUE,null,s. ironic, unusual asymptotes wake blithely r +null,PERU,null,platelets. blithely pending dependencies use fluffily across the even pinto beans. carefully silent accoun +null,CHINA,null,c dependencies. furiously express notornis sleep slyly regular accounts. ideas sleep. depos +null,ROMANIA,null,ular asymptotes are about the furious multipliers. express dependencies nag above the ironically ironic account +null,SAUDI ARABIA,null,ts. silent requests haggle. closely express packages sleep across the blithely +null,VIETNAM,null,hely enticingly express accounts. 
even, final +null,RUSSIA,null, requests against the platelets use never according to the quickly regular pint +null,UNITED KINGDOM,null,eans boost carefully special requests. accounts are. carefull +null,UNITED STATES,null,y final packages. slow foxes cajole quickly. quickly silent platelets breach ironic accounts. unusual pinto be \ No newline at end of file diff --git a/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java b/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java index 14fea08645..a2621009b9 100644 --- a/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java +++ b/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java @@ -1514,7 +1514,7 @@ private InsertNode buildInsertIntoTablePlan(PlanContext context, InsertNode inse // we use only a sequence of preceding columns of target table's schema // as target columns. // - // For example, consider a target table and an 'insert into' query are give as follows: + // For example, consider a target table and an 'insert into' query are given as follows: // // CREATE TABLE TB1 (col1 int, col2 int, col3 long); // || || @@ -1576,11 +1576,12 @@ private void buildProjectedInsert(PlanContext context, InsertNode insertNode) { // Modifying projected columns by adding NULL constants // It is because that table appender does not support target columns to be written. 
List targets = TUtil.newList(); - for (int i = 0, j = 0; i < tableSchema.size(); i++) { + for (int i = 0; i < tableSchema.size(); i++) { Column column = tableSchema.getColumn(i); - if(targetColumns.contains(column) && j < projectionNode.getTargets().length) { - targets.add(projectionNode.getTargets()[j++]); + int idxInProjectionNode = targetColumns.getIndex(column); + if (idxInProjectionNode >= 0 && idxInProjectionNode < projectionNode.getTargets().length) { + targets.add(projectionNode.getTargets()[idxInProjectionNode]); } else { targets.add(new Target(new ConstEval(NullDatum.get()), column.getSimpleName())); } From 556498f58451c9f17c1e734390c159fdf41b88b4 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 8 May 2015 15:12:15 +0900 Subject: [PATCH 030/141] TAJO-1576: Sometimes DefaultTajoCliOutputFormatter.parseErrorMessage() eliminates an important kind of information. Signed-off-by: Jihoon Son --- CHANGES | 4 ++++ .../cli/tsql/DefaultTajoCliOutputFormatter.java | 4 +--- .../cli/tsql/TestDefaultCliOutputFormatter.java | 13 +++++++++---- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/CHANGES b/CHANGES index de1144e0c9..47af73b1c4 100644 --- a/CHANGES +++ b/CHANGES @@ -9,6 +9,10 @@ Release 0.10.1 - unreleased IMPROVEMENT + TAJO-1576: Sometimes DefaultTajoCliOutputFormatter.parseErrorMessage() eliminates + an important kind of information. + (Contributed by Jongyoung Park, Committed by jihoon) + TAJO-1381: Support multi-bytes delimiter for Text file. 
(Contributed by navis, Committed by jinho) diff --git a/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/DefaultTajoCliOutputFormatter.java b/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/DefaultTajoCliOutputFormatter.java index 5cbe77b0b4..33624e54dd 100644 --- a/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/DefaultTajoCliOutputFormatter.java +++ b/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/DefaultTajoCliOutputFormatter.java @@ -196,10 +196,8 @@ public static String parseErrorMessage(String message) { if (message == null) { return TajoCli.ERROR_PREFIX + "No error message"; } - String[] lines = message.split("\n"); - message = lines[0]; - int index = message.lastIndexOf(TajoCli.ERROR_PREFIX); + int index = message.indexOf(TajoCli.ERROR_PREFIX); if (index < 0) { message = TajoCli.ERROR_PREFIX + message; } else { diff --git a/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestDefaultCliOutputFormatter.java b/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestDefaultCliOutputFormatter.java index df709c5d10..d0e43413de 100644 --- a/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestDefaultCliOutputFormatter.java +++ b/tajo-core/src/test/java/org/apache/tajo/cli/tsql/TestDefaultCliOutputFormatter.java @@ -101,13 +101,19 @@ public void testParseErrorMessage() { "\tat org.apache.tajo.rpc.ServerCallable.withRetries(ServerCallable.java:97)\n" + "\t... 
6 more"; - assertEquals("ERROR: no such a table: table1", DefaultTajoCliOutputFormatter.parseErrorMessage(multiLineMessage)); + assertEquals(multiLineMessage, DefaultTajoCliOutputFormatter.parseErrorMessage(multiLineMessage)); + + String noPrefixMessage = "RTFM please"; + assertEquals("ERROR: "+noPrefixMessage, DefaultTajoCliOutputFormatter.parseErrorMessage(noPrefixMessage)); + + String errorMessageWithLine = "ERROR: syntax error at or near '('\n" + + "LINE 1:7 select (*) from tc\n" + + " ^"; + assertEquals(errorMessageWithLine, DefaultTajoCliOutputFormatter.parseErrorMessage(errorMessageWithLine)); } @Test public void testPrintResultInsertStatement() throws Exception { - - DefaultTajoCliOutputFormatter outputFormatter = new DefaultTajoCliOutputFormatter(); outputFormatter.init(cliContext); @@ -176,5 +182,4 @@ public void testPrintResultSelectStatement() throws Exception { assertEquals(expectedOutput, stringWriter.toString()); } - } From facbf43235e07426696393e955ab6b3fd610d4ba Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Fri, 8 May 2015 00:52:09 -0700 Subject: [PATCH 031/141] TAJO-1452: Improve function listing order (Contributed Dongjoon Hyun, Committed by hyunsik) Closes #470 --- CHANGES | 8 +- .../tajo/function/FunctionSignature.java | 10 +- .../apache/tajo/function/FunctionUtil.java | 41 + .../tajo-hcatalog/pom.xml | 739 --------------- .../tajo/catalog/store/HCatalogStore.java | 891 ------------------ .../store/HCatalogStoreClientPool.java | 170 ---- .../tajo/catalog/store/HCatalogUtil.java | 147 --- .../tajo/catalog/store/TestHCatalogStore.java | 402 -------- .../tsql/commands/DescFunctionCommand.java | 18 +- .../java/org/apache/tajo/util/JSPUtil.java | 14 - .../resources/webapps/admin/functions.jsp | 3 +- 11 files changed, 59 insertions(+), 2384 deletions(-) delete mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml delete mode 100644 
tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStore.java delete mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStoreClientPool.java delete mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogUtil.java delete mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/test/java/org/apache/tajo/catalog/store/TestHCatalogStore.java diff --git a/CHANGES b/CHANGES index 47af73b1c4..05ea1f2758 100644 --- a/CHANGES +++ b/CHANGES @@ -9,9 +9,11 @@ Release 0.10.1 - unreleased IMPROVEMENT - TAJO-1576: Sometimes DefaultTajoCliOutputFormatter.parseErrorMessage() eliminates - an important kind of information. - (Contributed by Jongyoung Park, Committed by jihoon) + TAJO-1452: Improve function listing order (Contributed Dongjoon Hyun, + Committed by hyunsik) + + TAJO-1576: Sometimes DefaultTajoCliOutputFormatter.parseErrorMessage() eliminates + an important kind of information. (Contributed by Jongyoung Park, Committed by jihoon) TAJO-1381: Support multi-bytes delimiter for Text file. 
(Contributed by navis, Committed by jinho) diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/function/FunctionSignature.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/function/FunctionSignature.java index fc3a0560dc..89ee01713a 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/function/FunctionSignature.java +++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/function/FunctionSignature.java @@ -122,14 +122,20 @@ public int compareTo(FunctionSignature o) { return cmpVal; } - cmpVal = returnType.getType().compareTo(o.returnType.getType()); + cmpVal = functionType.name().compareTo(o.functionType.name()); + + if (cmpVal != 0) { + return cmpVal; + } + + cmpVal = returnType.getType().name().compareTo(o.returnType.getType().name()); if (cmpVal != 0) { return cmpVal; } for (int i = 0; i < Math.min(paramTypes.length, o.paramTypes.length); i++) { - cmpVal = paramTypes[i].getType().compareTo(o.paramTypes[i].getType()); + cmpVal = paramTypes[i].getType().name().compareTo(o.paramTypes[i].getType().name()); if (cmpVal != 0) { return cmpVal; diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/function/FunctionUtil.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/function/FunctionUtil.java index dee5d1c138..70119575e7 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/function/FunctionUtil.java +++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/function/FunctionUtil.java @@ -18,12 +18,53 @@ package org.apache.tajo.function; +import org.apache.tajo.catalog.FunctionDesc; +import org.apache.tajo.catalog.proto.CatalogProtos; + import java.util.Collection; +import java.util.Comparator; import static org.apache.tajo.common.TajoDataTypes.DataType; public class FunctionUtil { + public static class FunctionDescProtoComparator implements Comparator + { + @Override + public int 
compare(CatalogProtos.FunctionDescProto f1, CatalogProtos.FunctionDescProto f2) { + CatalogProtos.FunctionSignatureProto s1 = f1.getSignature(); + CatalogProtos.FunctionSignatureProto s2 = f2.getSignature(); + + int cmpVal = s1.getName().compareTo(s2.getName()); + + if (cmpVal != 0) { + return cmpVal; + } + + cmpVal = s1.getType().name().compareTo(s2.getType().name()); + + if (cmpVal != 0) { + return cmpVal; + } + + cmpVal = s1.getReturnType().getType().name().compareTo(s2.getReturnType().getType().name()); + + if (cmpVal != 0) { + return cmpVal; + } + + for (int i = 0; i < Math.min(s1.getParameterTypesCount(), s2.getParameterTypesCount()); i++) { + cmpVal = s1.getParameterTypes(i).getType().name().compareTo(s2.getParameterTypes(i).getType().name()); + + if (cmpVal != 0) { + return cmpVal; + } + } + + return s2.getParameterTypesCount() - s1.getParameterTypesCount(); + } + } + public static String buildFQFunctionSignature(String funcName, DataType returnType, DataType... paramTypes) { return returnType.getType().name().toLowerCase() + " " + buildSimpleFunctionSignature(funcName, paramTypes); } diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml deleted file mode 100644 index fe8f34a436..0000000000 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml +++ /dev/null @@ -1,739 +0,0 @@ - - - - - - tajo-project - org.apache.tajo - 0.11.0-SNAPSHOT - ../../../tajo-project - - 4.0.0 - tajo-hcatalog - jar - Tajo Catalog Drivers HCatalog - - UTF-8 - UTF-8 - - - - - - org.apache.maven.plugins - maven-compiler-plugin - - 1.6 - 1.6 - ${project.build.sourceEncoding} - - - - org.apache.rat - apache-rat-plugin - - - verify - - check - - - - - - derby.log - - - - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - prepare-package - - copy-dependencies - - - runtime - ${project.build.directory}/lib - false - false - true - - - - - - org.apache.maven.plugins - 
maven-surefire-report-plugin - - - - - - - org.apache.tajo - tajo-common - - - org.apache.tajo - tajo-catalog-common - - - org.apache.tajo - tajo-catalog-client - - - org.apache.tajo - tajo-catalog-server - - - org.apache.tajo - tajo-rpc - - - org.apache.tajo - tajo-storage-common - - - junit - junit - test - - - org.apache.thrift - libfb303 - 0.9.0 - provided - - - org.apache.thrift - libthrift - 0.9.0 - provided - - - org.apache.hadoop - hadoop-mapreduce-client-core - ${hadoop.version} - provided - - - org.apache.hadoop - hadoop-common - ${hadoop.version} - provided - - - - - - hcatalog-0.12.0 - - false - - - 0.12.0 - 1.5.0 - 2.1.0 - - - - org.apache.hive - hive-exec - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-contrib - - - org.apache.hive - hive-hbase-handler - - - org.apache.hive - hive-metastore - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-shims - - - org.apache.hive - hive-testutils - - - org.apache.thrift - libfb303 - - - org.apache.thrift - libthrift - - - com.jolbox - bonecp - - - com.google.protobuf - protobuf-java - - - - - org.apache.hive - hive-metastore - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-shimss - - - org.apache.thrift - libfb303 - - - org.apache.thrift - libthrift - - - com.jolbox - bonecp - - - - - org.apache.hive - hive-cli - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-exec - - - org.apache.hive - hive-metastore - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-service - - - org.apache.hive - hive-shims - - - com.jolbox - bonecp - - - jline - jline - - - - - org.apache.hive.hcatalog - hcatalog-core - ${hive.version} - - - org.apache.hive - hive-cli - - - org.apache.hive - hive-common - - - org.apache.hive - hive-exec - - - org.apache.hive - hive-metastore - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-service - 
- - org.apache.hive - hive-shims - - - com.jolbox - bonecp - - - - - com.twitter - parquet-hive-bundle - ${parquet.version} - - - - - hcatalog-0.13.0 - - false - - - 0.13.0 - 1.5.0 - 2.1.0 - - - - org.apache.hive - hive-exec - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-contrib - - - org.apache.hive - hive-hbase-handler - - - org.apache.hive - hive-metastore - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-shims - - - org.apache.hive - hive-testutils - - - org.apache.thrift - libfb303 - - - org.apache.thrift - libthrift - - - com.jolbox - bonecp - - - com.google.protobuf - protobuf-java - - - - - org.apache.hive - hive-metastore - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-shimss - - - org.apache.thrift - libfb303 - - - org.apache.thrift - libthrift - - - com.jolbox - bonecp - - - - - org.apache.hive - hive-cli - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-exec - - - org.apache.hive - hive-metastore - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-service - - - org.apache.hive - hive-shims - - - com.jolbox - bonecp - - - jline - jline - - - - - org.apache.hive.hcatalog - hive-hcatalog-core - ${hive.version} - - - org.apache.hive - hive-cli - - - org.apache.hive - hive-common - - - org.apache.hive - hive-exec - - - org.apache.hive - hive-metastore - - - com.google.guava - guava - - - org.codehaus.jackson - jackson-mapper-asl - - - - - com.twitter - parquet-hive-bundle - ${parquet.version} - - - - - hcatalog-0.13.1 - - false - - - 0.13.1 - 1.5.0 - 2.1.0 - - - - org.apache.hive - hive-exec - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-contrib - - - org.apache.hive - hive-hbase-handler - - - org.apache.hive - hive-metastore - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-shims - - - org.apache.hive - 
hive-testutils - - - org.apache.thrift - libfb303 - - - org.apache.thrift - libthrift - - - com.jolbox - bonecp - - - com.google.protobuf - protobuf-java - - - - - org.apache.hive - hive-metastore - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-shimss - - - org.apache.thrift - libfb303 - - - org.apache.thrift - libthrift - - - com.jolbox - bonecp - - - - - org.apache.hive - hive-cli - ${hive.version} - provided - - - org.apache.hive - hive-common - - - org.apache.hive - hive-exec - - - org.apache.hive - hive-metastore - - - org.apache.hive - hive-serde - - - org.apache.hive - hive-service - - - org.apache.hive - hive-shims - - - com.jolbox - bonecp - - - jline - jline - - - - - org.apache.hive.hcatalog - hive-hcatalog-core - ${hive.version} - - - org.apache.hive - hive-cli - - - org.apache.hive - hive-common - - - org.apache.hive - hive-exec - - - org.apache.hive - hive-metastore - - - com.google.guava - guava - - - org.codehaus.jackson - jackson-mapper-asl - - - - - com.twitter - parquet-hive-bundle - ${parquet.version} - - - - - docs - - false - - - - - org.apache.maven.plugins - maven-javadoc-plugin - - - - module-javadocs - package - - jar - - - ${project.build.directory} - - - - - - - - - src - - false - - - - - org.apache.maven.plugins - maven-source-plugin - - - - hadoop-java-sources - package - - jar-no-fork - - - - - - - - - - - - - org.apache.maven.plugins - maven-surefire-report-plugin - - - - - diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStore.java deleted file mode 100644 index 2c3fc6ac1a..0000000000 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStore.java +++ /dev/null @@ -1,891 +0,0 @@ -/** - * Licensed to the Apache Software 
Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.catalog.store; - -import com.google.common.collect.Lists; - -import org.apache.commons.lang.StringEscapeUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.metastore.TableType; -import org.apache.hadoop.hive.metastore.api.*; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; -import org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe; -import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; -import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; -import org.apache.hcatalog.common.HCatUtil; -import org.apache.hcatalog.data.schema.HCatFieldSchema; -import org.apache.hcatalog.data.schema.HCatSchema; -import org.apache.tajo.TajoConstants; -import org.apache.tajo.catalog.*; -import org.apache.tajo.catalog.exception.*; -import org.apache.tajo.catalog.partition.PartitionMethodDesc; -import org.apache.tajo.catalog.proto.CatalogProtos; -import org.apache.tajo.catalog.proto.CatalogProtos.ColumnProto; -import 
org.apache.tajo.catalog.proto.CatalogProtos.DatabaseProto; -import org.apache.tajo.catalog.proto.CatalogProtos.IndexProto; -import org.apache.tajo.catalog.proto.CatalogProtos.TableDescriptorProto; -import org.apache.tajo.catalog.proto.CatalogProtos.TableOptionProto; -import org.apache.tajo.catalog.proto.CatalogProtos.TablePartitionProto; -import org.apache.tajo.catalog.proto.CatalogProtos.TableStatsProto; -import org.apache.tajo.catalog.proto.CatalogProtos.TablespaceProto; -import org.apache.tajo.catalog.statistics.TableStats; -import org.apache.tajo.common.TajoDataTypes; -import org.apache.tajo.common.exception.NotImplementedException; -import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.exception.InternalException; -import org.apache.tajo.storage.StorageConstants; -import org.apache.tajo.util.KeyValueSet; -import org.apache.thrift.TException; - -import java.io.IOException; -import java.util.*; - -import static org.apache.tajo.catalog.proto.CatalogProtos.PartitionType; - -public class HCatalogStore extends CatalogConstants implements CatalogStore { - protected final Log LOG = LogFactory.getLog(getClass()); - - private static String HIVE_WAREHOUSE_DIR_CONF_KEY = "hive.metastore.warehouse.dir"; - - protected Configuration conf; - private static final int CLIENT_POOL_SIZE = 2; - private final HCatalogStoreClientPool clientPool; - private final String defaultTableSpaceUri; - - public HCatalogStore(final Configuration conf) throws InternalException { - if (!(conf instanceof TajoConf)) { - throw new CatalogException("Invalid Configuration Type:" + conf.getClass().getSimpleName()); - } - this.conf = conf; - this.defaultTableSpaceUri = TajoConf.getWarehouseDir((TajoConf) conf).toString(); - this.clientPool = new HCatalogStoreClientPool(CLIENT_POOL_SIZE, conf); - } - - @Override - public boolean existTable(final String databaseName, final String tableName) throws CatalogException { - boolean exist = false; - org.apache.hadoop.hive.ql.metadata.Table table; - 
HCatalogStoreClientPool.HCatalogStoreClient client = null; - - // get table - try { - client = clientPool.getClient(); - table = HCatalogUtil.getTable(client.getHiveClient(), databaseName, tableName); - if (table != null) { - exist = true; - } - } catch (NoSuchObjectException nsoe) { - exist = false; - } catch (Exception e) { - throw new CatalogException(e); - } finally { - if (client != null) { - client.release(); - } - } - - return exist; - } - - @Override - public final CatalogProtos.TableDescProto getTable(String databaseName, final String tableName) throws CatalogException { - org.apache.hadoop.hive.ql.metadata.Table table = null; - HCatalogStoreClientPool.HCatalogStoreClient client = null; - Path path = null; - CatalogProtos.StoreType storeType = null; - org.apache.tajo.catalog.Schema schema = null; - KeyValueSet options = null; - TableStats stats = null; - PartitionMethodDesc partitions = null; - - ////////////////////////////////// - // set tajo table schema. - ////////////////////////////////// - try { - // get hive table schema - try { - client = clientPool.getClient(); - table = HCatalogUtil.getTable(client.getHiveClient(), databaseName, tableName); - path = table.getPath(); - } catch (NoSuchObjectException nsoe) { - throw new CatalogException("Table not found. - tableName:" + tableName, nsoe); - } catch (Exception e) { - throw new CatalogException(e); - } - - // convert hcatalog field schema into tajo field schema. - schema = new org.apache.tajo.catalog.Schema(); - HCatSchema tableSchema = null; - - try { - tableSchema = HCatUtil.getTableSchemaWithPtnCols(table); - } catch (IOException ioe) { - throw new CatalogException("Fail to get table schema. 
- tableName:" + tableName, ioe); - } - List fieldSchemaList = tableSchema.getFields(); - boolean isPartitionKey = false; - for (HCatFieldSchema eachField : fieldSchemaList) { - isPartitionKey = false; - - if (table.getPartitionKeys() != null) { - for (FieldSchema partitionKey : table.getPartitionKeys()) { - if (partitionKey.getName().equals(eachField.getName())) { - isPartitionKey = true; - } - } - } - - if (!isPartitionKey) { - String fieldName = databaseName + CatalogConstants.IDENTIFIER_DELIMITER + tableName + - CatalogConstants.IDENTIFIER_DELIMITER + eachField.getName(); - TajoDataTypes.Type dataType = HCatalogUtil.getTajoFieldType(eachField.getType().toString()); - schema.addColumn(fieldName, dataType); - } - } - - // validate field schema. - try { - HCatalogUtil.validateHCatTableAndTajoSchema(tableSchema); - } catch (Exception e) { - throw new CatalogException("HCatalog cannot support schema. - schema:" + tableSchema.toString(), e); - } - - stats = new TableStats(); - options = new KeyValueSet(); - options.putAll(table.getParameters()); - options.remove("EXTERNAL"); - - Properties properties = table.getMetadata(); - if (properties != null) { - // set field delimiter - String fieldDelimiter = "", nullFormat = ""; - if (properties.getProperty(serdeConstants.FIELD_DELIM) != null) { - fieldDelimiter = properties.getProperty(serdeConstants.FIELD_DELIM); - } else { - // if hive table used default row format delimiter, Properties doesn't have it. 
- // So, Tajo must set as follows: - fieldDelimiter = "\u0001"; - } - - // set null format - if (properties.getProperty(serdeConstants.SERIALIZATION_NULL_FORMAT) != null) { - nullFormat = properties.getProperty(serdeConstants.SERIALIZATION_NULL_FORMAT); - } else { - nullFormat = "\\N"; - } - options.remove(serdeConstants.SERIALIZATION_NULL_FORMAT); - - // set file output format - String fileOutputformat = properties.getProperty(hive_metastoreConstants.FILE_OUTPUT_FORMAT); - storeType = CatalogUtil.getStoreType(HCatalogUtil.getStoreType(fileOutputformat)); - - if (storeType.equals(CatalogProtos.StoreType.TEXTFILE)) { - options.set(StorageConstants.TEXT_DELIMITER, StringEscapeUtils.escapeJava(fieldDelimiter)); - options.set(StorageConstants.TEXT_NULL, StringEscapeUtils.escapeJava(nullFormat)); - } else if (storeType.equals(CatalogProtos.StoreType.RCFILE)) { - options.set(StorageConstants.RCFILE_NULL, StringEscapeUtils.escapeJava(nullFormat)); - String serde = properties.getProperty(serdeConstants.SERIALIZATION_LIB); - if (LazyBinaryColumnarSerDe.class.getName().equals(serde)) { - options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE); - } else if (ColumnarSerDe.class.getName().equals(serde)) { - options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE); - } - } else if (storeType.equals(CatalogProtos.StoreType.SEQUENCEFILE) ) { - options.set(StorageConstants.SEQUENCEFILE_DELIMITER, StringEscapeUtils.escapeJava(fieldDelimiter)); - options.set(StorageConstants.SEQUENCEFILE_NULL, StringEscapeUtils.escapeJava(nullFormat)); - String serde = properties.getProperty(serdeConstants.SERIALIZATION_LIB); - if (LazyBinarySerDe.class.getName().equals(serde)) { - options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE); - } else if (LazySimpleSerDe.class.getName().equals(serde)) { - options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE); - } - } - - // set data size - long 
totalSize = 0; - if (properties.getProperty("totalSize") != null) { - totalSize = Long.parseLong(properties.getProperty("totalSize")); - } else { - try { - FileSystem fs = path.getFileSystem(conf); - if (fs.exists(path)) { - totalSize = fs.getContentSummary(path).getLength(); - } - } catch (IOException ioe) { - throw new CatalogException("Fail to get path. - path:" + path.toString(), ioe); - } - } - stats.setNumBytes(totalSize); - } - - // set partition keys - List partitionKeys = table.getPartitionKeys(); - - if (null != partitionKeys) { - org.apache.tajo.catalog.Schema expressionSchema = new org.apache.tajo.catalog.Schema(); - StringBuilder sb = new StringBuilder(); - if (partitionKeys.size() > 0) { - for (int i = 0; i < partitionKeys.size(); i++) { - FieldSchema fieldSchema = partitionKeys.get(i); - TajoDataTypes.Type dataType = HCatalogUtil.getTajoFieldType(fieldSchema.getType().toString()); - String fieldName = databaseName + CatalogConstants.IDENTIFIER_DELIMITER + tableName + - CatalogConstants.IDENTIFIER_DELIMITER + fieldSchema.getName(); - expressionSchema.addColumn(new Column(fieldName, dataType)); - if (i > 0) { - sb.append(","); - } - sb.append(fieldSchema.getName()); - } - partitions = new PartitionMethodDesc( - databaseName, - tableName, - PartitionType.COLUMN, - sb.toString(), - expressionSchema); - } - } - } finally { - if(client != null) client.release(); - } - TableMeta meta = new TableMeta(storeType, options); - TableDesc tableDesc = new TableDesc(databaseName + "." 
+ tableName, schema, meta, path.toUri()); - if (table.getTableType().equals(TableType.EXTERNAL_TABLE)) { - tableDesc.setExternal(true); - } - if (stats != null) { - tableDesc.setStats(stats); - } - if (partitions != null) { - tableDesc.setPartitionMethod(partitions); - } - return tableDesc.getProto(); - } - - - private TajoDataTypes.Type getDataType(final String typeStr) { - try { - return Enum.valueOf(TajoDataTypes.Type.class, typeStr); - } catch (IllegalArgumentException iae) { - LOG.error("Cannot find a matched type against from '" + typeStr + "'"); - return null; - } - } - - @Override - public final List getAllTableNames(String databaseName) throws CatalogException { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - - try { - client = clientPool.getClient(); - return client.getHiveClient().getAllTables(databaseName); - } catch (TException e) { - throw new CatalogException(e); - } finally { - if(client != null) client.release(); - } - } - - @Override - public void createTablespace(String spaceName, String spaceUri) throws CatalogException { - // SKIP - } - - @Override - public boolean existTablespace(String spaceName) throws CatalogException { - // SKIP - return spaceName.equals(TajoConstants.DEFAULT_TABLESPACE_NAME); - } - - @Override - public void dropTablespace(String spaceName) throws CatalogException { - // SKIP - } - - @Override - public Collection getAllTablespaceNames() throws CatalogException { - return Lists.newArrayList(TajoConstants.DEFAULT_TABLESPACE_NAME); - } - - @Override - public TablespaceProto getTablespace(String spaceName) throws CatalogException { - if (spaceName.equals(TajoConstants.DEFAULT_TABLESPACE_NAME)) { - TablespaceProto.Builder builder = TablespaceProto.newBuilder(); - builder.setSpaceName(TajoConstants.DEFAULT_TABLESPACE_NAME); - builder.setUri(defaultTableSpaceUri); - return builder.build(); - } else { - throw new CatalogException("tablespace concept is not supported in HCatalogStore"); - } - } - - @Override - public 
void updateTableStats(CatalogProtos.UpdateTableStatsProto statsProto) throws - CatalogException { - // TODO - not implemented yet - } - - @Override - public void alterTablespace(CatalogProtos.AlterTablespaceProto alterProto) throws CatalogException { - throw new CatalogException("tablespace concept is not supported in HCatalogStore"); - } - - @Override - public void createDatabase(String databaseName, String tablespaceName) throws CatalogException { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - - try { - Database database = new Database( - databaseName, - "", - defaultTableSpaceUri + "/" + databaseName, - new HashMap()); - client = clientPool.getClient(); - client.getHiveClient().createDatabase(database); - } catch (AlreadyExistsException e) { - throw new AlreadyExistsDatabaseException(databaseName); - } catch (Throwable t) { - throw new CatalogException(t); - } finally { - if (client != null) { - client.release(); - } - } - } - - @Override - public boolean existDatabase(String databaseName) throws CatalogException { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - - try { - client = clientPool.getClient(); - List databaseNames = client.getHiveClient().getAllDatabases(); - return databaseNames.contains(databaseName); - } catch (Throwable t) { - throw new CatalogException(t); - } finally { - if (client != null) { - client.release(); - } - } - } - - @Override - public void dropDatabase(String databaseName) throws CatalogException { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - - try { - client = clientPool.getClient(); - client.getHiveClient().dropDatabase(databaseName); - } catch (NoSuchObjectException e) { - throw new NoSuchDatabaseException(databaseName); - } catch (Throwable t) { - throw new CatalogException(databaseName); - } finally { - if (client != null) { - client.release(); - } - } - } - - @Override - public Collection getAllDatabaseNames() throws CatalogException { - 
HCatalogStoreClientPool.HCatalogStoreClient client = null; - - try { - client = clientPool.getClient(); - return client.getHiveClient().getAllDatabases(); - } catch (TException e) { - throw new CatalogException(e); - } finally { - if (client != null) { - client.release(); - } - } - } - - @Override - public final void createTable(final CatalogProtos.TableDescProto tableDescProto) throws CatalogException { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - - TableDesc tableDesc = new TableDesc(tableDescProto); - String[] splitted = CatalogUtil.splitFQTableName(tableDesc.getName()); - String databaseName = splitted[0]; - String tableName = splitted[1]; - - try { - client = clientPool.getClient(); - - org.apache.hadoop.hive.metastore.api.Table table = new org.apache.hadoop.hive.metastore.api.Table(); - table.setDbName(databaseName); - table.setTableName(tableName); - table.setParameters(new HashMap(tableDesc.getMeta().getOptions().getAllKeyValus())); - // TODO: set owner - //table.setOwner(); - - StorageDescriptor sd = new StorageDescriptor(); - sd.setSerdeInfo(new SerDeInfo()); - sd.getSerdeInfo().setParameters(new HashMap()); - sd.getSerdeInfo().setName(table.getTableName()); - - // if tajo set location method, thrift client make exception as follows: - // Caused by: MetaException(message:java.lang.NullPointerException) - // If you want to modify table path, you have to modify on Hive cli. 
- if (tableDesc.isExternal()) { - table.setTableType(TableType.EXTERNAL_TABLE.name()); - table.putToParameters("EXTERNAL", "TRUE"); - - Path tablePath = new Path(tableDesc.getPath()); - FileSystem fs = tablePath.getFileSystem(conf); - if (fs.isFile(tablePath)) { - LOG.warn("A table path is a file, but HCatalog does not allow a file path."); - sd.setLocation(tablePath.getParent().toString()); - } else { - sd.setLocation(tablePath.toString()); - } - } - - // set column information - List columns = tableDesc.getSchema().getColumns(); - ArrayList cols = new ArrayList(columns.size()); - - for (Column eachField : columns) { - cols.add(new FieldSchema(eachField.getSimpleName(), - HCatalogUtil.getHiveFieldType(eachField.getDataType()), "")); - } - sd.setCols(cols); - - // set partition keys - if (tableDesc.hasPartition() && tableDesc.getPartitionMethod().getPartitionType().equals(PartitionType.COLUMN)) { - List partitionKeys = new ArrayList(); - for (Column eachPartitionKey : tableDesc.getPartitionMethod().getExpressionSchema().getColumns()) { - partitionKeys.add(new FieldSchema(eachPartitionKey.getSimpleName(), - HCatalogUtil.getHiveFieldType(eachPartitionKey.getDataType()), "")); - } - table.setPartitionKeys(partitionKeys); - } - - if (tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.RCFILE)) { - String serde = tableDesc.getMeta().getOption(StorageConstants.RCFILE_SERDE); - sd.setInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat.class.getName()); - sd.setOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat.class.getName()); - if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) { - sd.getSerdeInfo().setSerializationLib(org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class.getName()); - } else { - sd.getSerdeInfo().setSerializationLib( - org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe.class.getName()); - } - - if (tableDesc.getMeta().getOptions().containsKey(StorageConstants.RCFILE_NULL)) { - 
table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT, - StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.RCFILE_NULL))); - } - } else if (tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.CSV) - || tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.TEXTFILE)) { - sd.getSerdeInfo().setSerializationLib(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName()); - sd.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class.getName()); - sd.setOutputFormat(org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat.class.getName()); - - String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.TEXT_DELIMITER, - StorageConstants.DEFAULT_FIELD_DELIMITER); - - // User can use an unicode for filed delimiter such as \u0001, \001. - // In this case, java console will convert this value into "\\u001". - // And hive will un-espace this value again. - // As a result, user can use right field delimiter. - // So, we have to un-escape this value. 
- sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT, - StringEscapeUtils.unescapeJava(fieldDelimiter)); - sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, - StringEscapeUtils.unescapeJava(fieldDelimiter)); - table.getParameters().remove(StorageConstants.TEXT_DELIMITER); - - if (tableDesc.getMeta().containsOption(StorageConstants.TEXT_NULL)) { - table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT, - StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.TEXT_NULL))); - table.getParameters().remove(StorageConstants.TEXT_NULL); - } - } else if (tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.SEQUENCEFILE)) { - String serde = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE); - sd.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getName()); - sd.setOutputFormat(org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat.class.getName()); - - if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) { - sd.getSerdeInfo().setSerializationLib(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName()); - - String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_DELIMITER, - StorageConstants.DEFAULT_FIELD_DELIMITER); - - // User can use an unicode for filed delimiter such as \u0001, \001. - // In this case, java console will convert this value into "\\u001". - // And hive will un-espace this value again. - // As a result, user can use right field delimiter. - // So, we have to un-escape this value. 
- sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT, - StringEscapeUtils.unescapeJava(fieldDelimiter)); - sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, - StringEscapeUtils.unescapeJava(fieldDelimiter)); - table.getParameters().remove(StorageConstants.SEQUENCEFILE_DELIMITER); - } else { - sd.getSerdeInfo().setSerializationLib(org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class.getName()); - } - - if (tableDesc.getMeta().containsOption(StorageConstants.SEQUENCEFILE_NULL)) { - table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT, - StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_NULL))); - table.getParameters().remove(StorageConstants.SEQUENCEFILE_NULL); - } - } else { - if (tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.PARQUET)) { - sd.setInputFormat(parquet.hive.DeprecatedParquetInputFormat.class.getName()); - sd.setOutputFormat(parquet.hive.DeprecatedParquetOutputFormat.class.getName()); - sd.getSerdeInfo().setSerializationLib(parquet.hive.serde.ParquetHiveSerDe.class.getName()); - } else { - throw new CatalogException(new NotImplementedException(tableDesc.getMeta().getStoreType - ().name())); - } - } - - sd.setSortCols(new ArrayList()); - - table.setSd(sd); - client.getHiveClient().createTable(table); - } catch (RuntimeException e) { - throw e; - } catch (Exception e) { - throw new CatalogException(e); - } finally { - if(client != null) client.release(); - } - } - - @Override - public final void dropTable(String databaseName, final String tableName) throws CatalogException { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - - try { - client = clientPool.getClient(); - client.getHiveClient().dropTable(databaseName, tableName, false, false); - } catch (NoSuchObjectException nsoe) { - } catch (Exception e) { - throw new CatalogException(e); - } finally { - if (client != null) { - client.release(); - } - } - } - - - @Override - public 
void alterTable(final CatalogProtos.AlterTableDescProto alterTableDescProto) throws CatalogException { - final String[] split = CatalogUtil.splitFQTableName(alterTableDescProto.getTableName()); - - if (split.length == 1) { - throw new IllegalArgumentException("alterTable() requires a qualified table name, but it is \"" - + alterTableDescProto.getTableName() + "\"."); - } - - final String databaseName = split[0]; - final String tableName = split[1]; - - - switch (alterTableDescProto.getAlterTableType()) { - case RENAME_TABLE: - if (existTable(databaseName,alterTableDescProto.getNewTableName().toLowerCase())) { - throw new AlreadyExistsTableException(alterTableDescProto.getNewTableName()); - } - renameTable(databaseName, tableName, alterTableDescProto.getNewTableName().toLowerCase()); - break; - case RENAME_COLUMN: - if (existColumn(databaseName,tableName, alterTableDescProto.getAlterColumnName().getNewColumnName())) { - throw new ColumnNameAlreadyExistException(alterTableDescProto.getAlterColumnName().getNewColumnName()); - } - renameColumn(databaseName, tableName, alterTableDescProto.getAlterColumnName()); - break; - case ADD_COLUMN: - if (existColumn(databaseName,tableName, alterTableDescProto.getAddColumn().getName())) { - throw new ColumnNameAlreadyExistException(alterTableDescProto.getAddColumn().getName()); - } - addNewColumn(databaseName, tableName, alterTableDescProto.getAddColumn()); - break; - default: - //TODO - } - } - - - private void renameTable(String databaseName, String tableName, String newTableName) { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - try { - client = clientPool.getClient(); - Table newTable = client.getHiveClient().getTable(databaseName, tableName); - newTable.setTableName(newTableName); - client.getHiveClient().alter_table(databaseName, tableName, newTable); - - } catch (NoSuchObjectException nsoe) { - } catch (Exception e) { - throw new CatalogException(e); - } finally { - if (client != null) { - client.release(); - 
} - } - } - - private void renameColumn(String databaseName, String tableName, CatalogProtos.AlterColumnProto alterColumnProto) { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - try { - - client = clientPool.getClient(); - Table table = client.getHiveClient().getTable(databaseName, tableName); - List columns = table.getSd().getCols(); - - for (final FieldSchema currentColumn : columns) { - if (currentColumn.getName().equalsIgnoreCase(alterColumnProto.getOldColumnName())) { - currentColumn.setName(alterColumnProto.getNewColumnName()); - } - } - client.getHiveClient().alter_table(databaseName, tableName, table); - - } catch (NoSuchObjectException nsoe) { - } catch (Exception e) { - throw new CatalogException(e); - } finally { - if (client != null) { - client.release(); - } - } - } - - - private void addNewColumn(String databaseName, String tableName, CatalogProtos.ColumnProto columnProto) { - HCatalogStoreClientPool.HCatalogStoreClient client = null; - try { - - client = clientPool.getClient(); - Table table = client.getHiveClient().getTable(databaseName, tableName); - List columns = table.getSd().getCols(); - columns.add(new FieldSchema(columnProto.getName(), - HCatalogUtil.getHiveFieldType(columnProto.getDataType()), "")); - client.getHiveClient().alter_table(databaseName, tableName, table); - - - } catch (NoSuchObjectException nsoe) { - } catch (Exception e) { - throw new CatalogException(e); - } finally { - if (client != null) { - client.release(); - } - } - } - - @Override - public void addPartitionMethod(CatalogProtos.PartitionMethodProto partitionMethodProto) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public CatalogProtos.PartitionMethodProto getPartitionMethod(String databaseName, String tableName) - throws CatalogException { - return null; // TODO - not implemented yet - } - - @Override - public boolean existPartitionMethod(String databaseName, String tableName) throws CatalogException { - return false; // 
TODO - not implemented yet - } - - @Override - public void dropPartitionMethod(String databaseName, String tableName) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public void addPartitions(CatalogProtos.PartitionsProto partitionsProto) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public void addPartition(String databaseName, String tableName, CatalogProtos.PartitionDescProto partitionDescProto) throws CatalogException { - - } - - @Override - public CatalogProtos.PartitionsProto getPartitions(String tableName) throws CatalogException { - return null; // TODO - not implemented yet - } - - @Override - public CatalogProtos.PartitionDescProto getPartition(String partitionName) throws CatalogException { - return null; // TODO - not implemented yet - } - - @Override - public void delPartition(String partitionName) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public void dropPartitions(String tableName) throws CatalogException { - - } - - - @Override - public final void addFunction(final FunctionDesc func) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public final void deleteFunction(final FunctionDesc func) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public final void existFunction(final FunctionDesc func) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public final List getAllFunctionNames() throws CatalogException { - // TODO - not implemented yet - return null; - } - - @Override - public void dropIndex(String databaseName, String indexName) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public boolean existIndexByName(String databaseName, String indexName) throws CatalogException { - // TODO - not implemented yet - return false; - } - - @Override - public CatalogProtos.IndexDescProto[] getIndexes(String databaseName, String tableName) throws 
CatalogException { - // TODO - not implemented yet - return null; - } - - @Override - public void createIndex(CatalogProtos.IndexDescProto proto) throws CatalogException { - // TODO - not implemented yet - } - - @Override - public CatalogProtos.IndexDescProto getIndexByName(String databaseName, String indexName) throws CatalogException { - // TODO - not implemented yet - return null; - } - - @Override - public CatalogProtos.IndexDescProto getIndexByColumn(String databaseName, String tableName, String columnName) - throws CatalogException { - // TODO - not implemented yet - return null; - } - - @Override - public boolean existIndexByColumn(String databaseName, String tableName, String columnName) throws CatalogException { - // TODO - not implemented yet - return false; - } - - @Override - public final void close() { - clientPool.close(); - } - - private boolean existColumn(final String databaseName ,final String tableName , final String columnName) throws CatalogException { - boolean exist = false; - HCatalogStoreClientPool.HCatalogStoreClient client = null; - - try { - - client = clientPool.getClient(); - Table table = client.getHiveClient().getTable(databaseName, tableName); - List columns = table.getSd().getCols(); - - for (final FieldSchema currentColumn : columns) { - if (currentColumn.getName().equalsIgnoreCase(columnName)) { - exist = true; - } - } - client.getHiveClient().alter_table(databaseName, tableName, table); - - } catch (NoSuchObjectException nsoe) { - } catch (Exception e) { - throw new CatalogException(e); - } finally { - if (client != null) { - client.release(); - } - } - - return exist; - } - - @Override - public List getAllColumns() throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List getAllDatabases() throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List getAllIndexes() throws CatalogException { - throw new UnsupportedOperationException(); - } - - 
@Override - public List getAllPartitions() throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List getAllTableOptions() throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List getAllTableStats() throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List getAllTables() throws CatalogException { - throw new UnsupportedOperationException(); - } - - @Override - public List getTablespaces() throws CatalogException { - throw new UnsupportedOperationException(); - } -} diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStoreClientPool.java b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStoreClientPool.java deleted file mode 100644 index 8ccb100b01..0000000000 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStoreClientPool.java +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright 2012 Cloudera Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-package org.apache.tajo.catalog.store; - - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.hadoop.hive.metastore.*; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.Table; -import org.apache.log4j.Logger; - -import java.util.Iterator; -import java.util.Map; -import java.util.Map.Entry; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.atomic.AtomicBoolean; - -/** - * Manages a pool of HiveMetaStoreClient connections. If the connection pool is empty - * a new client is created and added to the pool. There is no size limit. - */ -public class HCatalogStoreClientPool { - private static final Logger LOG = Logger.getLogger(HCatalogStoreClientPool.class); - private final ConcurrentLinkedQueue clientPool = - new ConcurrentLinkedQueue(); - private AtomicBoolean poolClosed = new AtomicBoolean(false); - private HiveConf hiveConf; - - /** - * A wrapper around the HiveMetaStoreClient that manages interactions with the - * connection pool. - */ - public class HCatalogStoreClient { - private final IMetaStoreClient hiveClient; - public AtomicBoolean isInUse = new AtomicBoolean(false); - - private HCatalogStoreClient(HiveConf hiveConf) { - try { - HiveMetaHookLoader hookLoader = new HiveMetaHookLoader() { - @Override - public HiveMetaHook getHook(Table table) throws MetaException { - /* metadata hook implementation, or null if this - * storage handler does not need any metadata notifications - */ - return null; - } - }; - - this.hiveClient = RetryingMetaStoreClient.getProxy(hiveConf, hookLoader, HiveMetaStoreClient.class.getName()); - clientPool.add(this); - LOG.info("MetaStoreClient created (size = " + clientPool.size() + ")"); - } catch (Exception e) { - // Turn in to an unchecked exception - throw new IllegalStateException(e); - } - } - - /** - * Returns the internal HiveMetaStoreClient object. 
- */ - public IMetaStoreClient getHiveClient() { - return hiveClient; - } - - /** - * Returns this client back to the connection pool. If the connection pool has been - * closed, just close the Hive client connection. - */ - public synchronized void release() { - if(!this.isInUse.getAndSet(false)){ - return; - } - // Ensure the connection isn't returned to the pool if the pool has been closed. - // This lock is needed to ensure proper behavior when a thread reads poolClosed - // is false, but a call to pool.close() comes in immediately afterward. - if (poolClosed.get()) { - this.getHiveClient().close(); - } else { - clientPool.add(this); - } - } - - // Marks this client as in use - private void markInUse() { - isInUse.set(true); - } - } - - public HCatalogStoreClientPool(int initialSize) { - this(initialSize, new HiveConf(HCatalogStoreClientPool.class)); - } - - public HCatalogStoreClientPool(int initialSize, HiveConf hiveConf) { - this.hiveConf = hiveConf; - addClients(initialSize); - } - - public HCatalogStoreClientPool(int initialSize, Configuration conf) { - this.hiveConf = new HiveConf(); - setParameters(conf); - addClients(initialSize); - } - - public void setParameters(Configuration conf) { - for( Iterator> iter = conf.iterator(); iter.hasNext();) { - Map.Entry entry = iter.next(); - this.hiveConf.set(entry.getKey(), entry.getValue()); - } - } - - /** - * Add numClients to the client pool. - */ - public void addClients(int numClients) { - for (int i = 0; i < numClients; ++i) { - clientPool.add(new HCatalogStoreClient(hiveConf)); - } - } - - /** - * Gets a client from the pool. If the pool is empty a new client is created. - */ - public synchronized HCatalogStoreClient getClient() { - // The MetaStoreClient c'tor relies on knowing the Hadoop version by asking - // org.apache.hadoop.util.VersionInfo. 
The VersionInfo class relies on opening - // the 'common-version-info.properties' file as a resource from hadoop-common*.jar - // using the Thread's context classloader. If necessary, set the Thread's context - // classloader, otherwise VersionInfo will fail in it's c'tor. - if (Thread.currentThread().getContextClassLoader() == null) { - Thread.currentThread().setContextClassLoader(ClassLoader.getSystemClassLoader()); - } - - HCatalogStoreClient client = clientPool.poll(); - // The pool was empty so create a new client and return that. - if (client == null) { - client = new HCatalogStoreClient(hiveConf); - } - client.markInUse(); - - return client; - } - - /** - * Removes all items from the connection pool and closes all Hive Meta Store client - * connections. Can be called multiple times. - */ - public void close() { - // Ensure no more items get added to the pool once close is called. - if (poolClosed.getAndSet(true)) { - return; - } - - HCatalogStoreClient client = null; - while ((client = clientPool.poll()) != null) { - client.getHiveClient().close(); - } - } -} diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogUtil.java b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogUtil.java deleted file mode 100644 index 8e8e58cc51..0000000000 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogUtil.java +++ /dev/null @@ -1,147 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.catalog.store; - -import com.google.common.base.Preconditions; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.metastore.IMetaStoreClient; -import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; -import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat; -import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; -import org.apache.hadoop.hive.ql.metadata.Table; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hcatalog.common.HCatException; -import org.apache.hcatalog.data.schema.HCatFieldSchema; -import org.apache.hcatalog.data.schema.HCatSchema; -import org.apache.tajo.catalog.exception.CatalogException; -import org.apache.tajo.catalog.proto.CatalogProtos; -import org.apache.tajo.catalog.CatalogUtil; -import org.apache.tajo.common.TajoDataTypes; -import org.apache.thrift.TException; -import parquet.hadoop.mapred.DeprecatedParquetOutputFormat; - -public class HCatalogUtil { - protected final Log LOG = LogFactory.getLog(getClass()); - - public static void validateHCatTableAndTajoSchema(HCatSchema tblSchema) throws CatalogException { - for (HCatFieldSchema hcatField : tblSchema.getFields()) { - validateHCatFieldAndTajoSchema(hcatField); - } - } - - private static void validateHCatFieldAndTajoSchema(HCatFieldSchema fieldSchema) throws CatalogException { - try { - HCatFieldSchema.Type fieldType = fieldSchema.getType(); - switch (fieldType) { - case ARRAY: - throw new HCatException("Tajo cannot support array field 
type."); - case STRUCT: - throw new HCatException("Tajo cannot support struct field type."); - case MAP: - throw new HCatException("Tajo cannot support map field type."); - } - } catch (HCatException e) { - throw new CatalogException("incompatible hcatalog types when assigning to tajo type. - " + - "HCatFieldSchema:" + fieldSchema); - } - } - - public static TajoDataTypes.Type getTajoFieldType(String fieldType) { - Preconditions.checkNotNull(fieldType); - - String typeStr = null; - - if(fieldType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)) - typeStr = "INT4"; - else if(fieldType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)) - typeStr = "INT1"; - else if(fieldType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)) - typeStr = "INT2"; - else if(fieldType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) - typeStr = "INT8"; - else if(fieldType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) - typeStr = "BOOLEAN"; - else if(fieldType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) - typeStr = "FLOAT4"; - else if(fieldType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) - typeStr = "FLOAT8"; - else if(fieldType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)) - typeStr = "TEXT"; - else if(fieldType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) - typeStr = "BLOB"; - - try { - return Enum.valueOf(TajoDataTypes.Type.class, typeStr); - } catch (IllegalArgumentException iae) { - throw new CatalogException("Cannot find a matched type against from '" + typeStr + "'"); - } - } - - public static String getHiveFieldType(TajoDataTypes.DataType dataType) { - Preconditions.checkNotNull(dataType); - - switch (dataType.getType()) { - case CHAR: return serdeConstants.CHAR_TYPE_NAME; - case BOOLEAN: return serdeConstants.BOOLEAN_TYPE_NAME; - case INT1: return serdeConstants.TINYINT_TYPE_NAME; - case INT2: return serdeConstants.SMALLINT_TYPE_NAME; - case INT4: return serdeConstants.INT_TYPE_NAME; - case INT8: return serdeConstants.BIGINT_TYPE_NAME; - case 
FLOAT4: return serdeConstants.FLOAT_TYPE_NAME; - case FLOAT8: return serdeConstants.DOUBLE_TYPE_NAME; - case TEXT: return serdeConstants.STRING_TYPE_NAME; - case VARCHAR: return serdeConstants.VARCHAR_TYPE_NAME; - case NCHAR: return serdeConstants.VARCHAR_TYPE_NAME; - case NVARCHAR: return serdeConstants.VARCHAR_TYPE_NAME; - case BINARY: return serdeConstants.BINARY_TYPE_NAME; - case VARBINARY: return serdeConstants.BINARY_TYPE_NAME; - case BLOB: return serdeConstants.BINARY_TYPE_NAME; - case DATE: return serdeConstants.DATE_TYPE_NAME; - case TIMESTAMP: return serdeConstants.TIMESTAMP_TYPE_NAME; - default: - throw new CatalogException(dataType + " is not supported."); - } - } - - public static String getStoreType(String fileFormat) { - Preconditions.checkNotNull(fileFormat); - - String[] fileFormatArrary = fileFormat.split("\\."); - if(fileFormatArrary.length < 1) { - throw new CatalogException("Hive file output format is wrong. - file output format:" + fileFormat); - } - - String outputFormatClass = fileFormatArrary[fileFormatArrary.length-1]; - if(outputFormatClass.equals(HiveIgnoreKeyTextOutputFormat.class.getSimpleName())) { - return CatalogUtil.TEXTFILE_NAME; - } else if(outputFormatClass.equals(HiveSequenceFileOutputFormat.class.getSimpleName())) { - return CatalogProtos.StoreType.SEQUENCEFILE.name(); - } else if(outputFormatClass.equals(RCFileOutputFormat.class.getSimpleName())) { - return CatalogProtos.StoreType.RCFILE.name(); - } else if(outputFormatClass.equals(DeprecatedParquetOutputFormat.class.getSimpleName())) { - return CatalogProtos.StoreType.PARQUET.name(); - } else { - throw new CatalogException("Not supported file output format. 
- file output format:" + fileFormat); - } - } - - public static Table getTable(IMetaStoreClient client, String dbName, String tableName) throws TException { - return new Table(client.getTable(dbName, tableName)); - } -} diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/test/java/org/apache/tajo/catalog/store/TestHCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/test/java/org/apache/tajo/catalog/store/TestHCatalogStore.java deleted file mode 100644 index 725f665394..0000000000 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/test/java/org/apache/tajo/catalog/store/TestHCatalogStore.java +++ /dev/null @@ -1,402 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.catalog.store; - - -import org.apache.commons.lang.StringEscapeUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.conf.HiveConf; -import org.apache.tajo.catalog.CatalogUtil; -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.catalog.TableDesc; -import org.apache.tajo.catalog.TableMeta; -import org.apache.tajo.catalog.partition.PartitionMethodDesc; -import org.apache.tajo.catalog.proto.CatalogProtos; -import org.apache.tajo.common.TajoDataTypes; -import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.storage.StorageConstants; -import org.apache.tajo.util.CommonTestingUtil; -import org.apache.tajo.util.KeyValueSet; -import org.junit.AfterClass; -import org.junit.BeforeClass; -import org.junit.Test; - -import java.io.IOException; -import java.util.List; - -import static org.junit.Assert.*; - -/** - * TestHCatalogStore. Test case for - * {@link org.apache.tajo.catalog.store.HCatalogStore} - */ - -public class TestHCatalogStore { - private static final String DB_NAME = "test_hive"; - private static final String CUSTOMER = "customer"; - private static final String NATION = "nation"; - private static final String REGION = "region"; - private static final String SUPPLIER = "supplier"; - - private static HCatalogStore store; - private static Path warehousePath; - - @BeforeClass - public static void setUp() throws Exception { - Path testPath = CommonTestingUtil.getTestDir(); - warehousePath = new Path(testPath, "warehouse"); - - //create local hiveMeta - HiveConf conf = new HiveConf(); - String jdbcUri = "jdbc:derby:;databaseName="+testPath.toUri().getPath()+"metastore_db;create=true"; - conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousePath.toUri().toString()); - conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, jdbcUri); - conf.set(TajoConf.ConfVars.WAREHOUSE_DIR.varname, 
warehousePath.toUri().toString()); - - // create local HCatalogStore. - TajoConf tajoConf = new TajoConf(conf); - store = new HCatalogStore(tajoConf); - store.createDatabase(DB_NAME, null); - } - - @AfterClass - public static void tearDown() throws IOException { - store.close(); - } - - @Test - public void testTableUsingTextFile() throws Exception { - TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, new KeyValueSet()); - - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("c_custkey", TajoDataTypes.Type.INT4); - schema.addColumn("c_name", TajoDataTypes.Type.TEXT); - schema.addColumn("c_address", TajoDataTypes.Type.TEXT); - schema.addColumn("c_nationkey", TajoDataTypes.Type.INT4); - schema.addColumn("c_phone", TajoDataTypes.Type.TEXT); - schema.addColumn("c_acctbal", TajoDataTypes.Type.FLOAT8); - schema.addColumn("c_mktsegment", TajoDataTypes.Type.TEXT); - schema.addColumn("c_comment", TajoDataTypes.Type.TEXT); - - TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta, - new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri()); - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, CUSTOMER)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER)); - assertEquals(table.getName(), table1.getName()); - assertEquals(table.getPath(), table1.getPath()); - assertEquals(table.getSchema().size(), table1.getSchema().size()); - for (int i = 0; i < table.getSchema().size(); i++) { - assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); - } - - assertEquals(StringEscapeUtils.escapeJava(StorageConstants.DEFAULT_FIELD_DELIMITER), - table1.getMeta().getOption(StorageConstants.TEXT_DELIMITER)); - store.dropTable(DB_NAME, CUSTOMER); - } - - @Test - public void testTableUsingRCFileWithBinarySerde() throws Exception { - KeyValueSet options = new KeyValueSet(); - 
options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.RCFILE, options); - - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("r_regionkey", TajoDataTypes.Type.INT4); - schema.addColumn("r_name", TajoDataTypes.Type.TEXT); - schema.addColumn("r_comment", TajoDataTypes.Type.TEXT); - - TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, REGION), schema, meta, - new Path(warehousePath, new Path(DB_NAME, REGION)).toUri()); - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, REGION)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION)); - assertEquals(table.getName(), table1.getName()); - assertEquals(table.getPath(), table1.getPath()); - assertEquals(table.getSchema().size(), table1.getSchema().size()); - for (int i = 0; i < table.getSchema().size(); i++) { - assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); - } - - assertEquals(StorageConstants.DEFAULT_BINARY_SERDE, - table1.getMeta().getOption(StorageConstants.RCFILE_SERDE)); - store.dropTable(DB_NAME, REGION); - } - - @Test - public void testTableUsingRCFileWithTextSerde() throws Exception { - KeyValueSet options = new KeyValueSet(); - options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.RCFILE, options); - - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("r_regionkey", TajoDataTypes.Type.INT4); - schema.addColumn("r_name", TajoDataTypes.Type.TEXT); - schema.addColumn("r_comment", TajoDataTypes.Type.TEXT); - - TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, REGION), schema, meta, - new Path(warehousePath, new Path(DB_NAME, REGION)).toUri()); - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, 
REGION)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION)); - assertEquals(table.getName(), table1.getName()); - assertEquals(table.getPath(), table1.getPath()); - assertEquals(table.getSchema().size(), table1.getSchema().size()); - for (int i = 0; i < table.getSchema().size(); i++) { - assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); - } - - assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, table1.getMeta().getOption(StorageConstants.RCFILE_SERDE)); - store.dropTable(DB_NAME, REGION); - } - - @Test - public void testTableWithNullValue() throws Exception { - KeyValueSet options = new KeyValueSet(); - options.set(StorageConstants.TEXT_DELIMITER, StringEscapeUtils.escapeJava("\u0002")); - options.set(StorageConstants.TEXT_NULL, StringEscapeUtils.escapeJava("\u0003")); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, options); - - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("s_suppkey", TajoDataTypes.Type.INT4); - schema.addColumn("s_name", TajoDataTypes.Type.TEXT); - schema.addColumn("s_address", TajoDataTypes.Type.TEXT); - schema.addColumn("s_nationkey", TajoDataTypes.Type.INT4); - schema.addColumn("s_phone", TajoDataTypes.Type.TEXT); - schema.addColumn("s_acctbal", TajoDataTypes.Type.FLOAT8); - schema.addColumn("s_comment", TajoDataTypes.Type.TEXT); - - TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, SUPPLIER), schema, meta, - new Path(warehousePath, new Path(DB_NAME, SUPPLIER)).toUri()); - - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, SUPPLIER)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, SUPPLIER)); - assertEquals(table.getName(), table1.getName()); - assertEquals(table.getPath(), table1.getPath()); - assertEquals(table.getSchema().size(), table1.getSchema().size()); - for (int i = 0; i < table.getSchema().size(); i++) { - 
assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); - } - - assertEquals(table.getMeta().getOption(StorageConstants.TEXT_DELIMITER), - table1.getMeta().getOption(StorageConstants.TEXT_DELIMITER)); - - assertEquals(table.getMeta().getOption(StorageConstants.TEXT_NULL), - table1.getMeta().getOption(StorageConstants.TEXT_NULL)); - - assertEquals(table1.getMeta().getOption(StorageConstants.TEXT_DELIMITER), - StringEscapeUtils.escapeJava("\u0002")); - - assertEquals(table1.getMeta().getOption(StorageConstants.TEXT_NULL), - StringEscapeUtils.escapeJava("\u0003")); - - store.dropTable(DB_NAME, SUPPLIER); - - } - - @Test - public void testAddTableByPartition() throws Exception { - TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, new KeyValueSet()); - - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("n_name", TajoDataTypes.Type.TEXT); - schema.addColumn("n_regionkey", TajoDataTypes.Type.INT4); - schema.addColumn("n_comment", TajoDataTypes.Type.TEXT); - - - TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, NATION), schema, meta, - new Path(warehousePath, new Path(DB_NAME, NATION)).toUri()); - - org.apache.tajo.catalog.Schema expressionSchema = new org.apache.tajo.catalog.Schema(); - expressionSchema.addColumn("n_nationkey", TajoDataTypes.Type.INT4); - - PartitionMethodDesc partitions = new PartitionMethodDesc( - DB_NAME, - NATION, - CatalogProtos.PartitionType.COLUMN, expressionSchema.getColumn(0).getQualifiedName(), expressionSchema); - table.setPartitionMethod(partitions); - - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, NATION)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, NATION)); - assertEquals(table.getName(), table1.getName()); - assertEquals(table.getPath(), table1.getPath()); - assertEquals(table.getSchema().size(), table1.getSchema().size()); - for (int i = 0; i < 
table.getSchema().size(); i++) { - assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); - } - - - Schema partitionSchema = table.getPartitionMethod().getExpressionSchema(); - Schema partitionSchema1 = table1.getPartitionMethod().getExpressionSchema(); - assertEquals(partitionSchema.size(), partitionSchema1.size()); - for (int i = 0; i < partitionSchema.size(); i++) { - assertEquals(partitionSchema.getColumn(i).getSimpleName(), partitionSchema1.getColumn(i).getSimpleName()); - } - - store.dropTable(DB_NAME, NATION); - } - - - @Test - public void testGetAllTableNames() throws Exception{ - TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, new KeyValueSet()); - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("n_name", TajoDataTypes.Type.TEXT); - schema.addColumn("n_regionkey", TajoDataTypes.Type.INT4); - schema.addColumn("n_comment", TajoDataTypes.Type.TEXT); - - String[] tableNames = new String[]{"table1", "table2", "table3"}; - - for(String tableName : tableNames){ - TableDesc table = new TableDesc(CatalogUtil.buildFQName("default", tableName), schema, meta, - new Path(warehousePath, new Path(DB_NAME, tableName)).toUri()); - store.createTable(table.getProto()); - } - - List tables = store.getAllTableNames("default"); - assertEquals(tableNames.length, tables.size()); - - for(String tableName : tableNames){ - assertTrue(tables.contains(tableName)); - } - - for(String tableName : tableNames){ - store.dropTable("default", tableName); - } - } - - @Test - public void testDeleteTable() throws Exception { - TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, new KeyValueSet()); - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("n_name", TajoDataTypes.Type.TEXT); - schema.addColumn("n_regionkey", TajoDataTypes.Type.INT4); - schema.addColumn("n_comment", TajoDataTypes.Type.TEXT); - - String tableName = 
"table1"; - TableDesc table = new TableDesc(DB_NAME + "." + tableName, schema, meta, warehousePath.toUri()); - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, tableName)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, tableName)); - FileSystem fs = FileSystem.getLocal(new Configuration()); - assertTrue(fs.exists(new Path(table1.getPath()))); - - store.dropTable(DB_NAME, tableName); - assertFalse(store.existTable(DB_NAME, tableName)); - fs.close(); - } - - @Test - public void testTableUsingSequenceFileWithBinarySerde() throws Exception { - KeyValueSet options = new KeyValueSet(); - options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.SEQUENCEFILE, options); - - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("r_regionkey", TajoDataTypes.Type.INT4); - schema.addColumn("r_name", TajoDataTypes.Type.TEXT); - schema.addColumn("r_comment", TajoDataTypes.Type.TEXT); - - TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, REGION), schema, meta, - new Path(warehousePath, new Path(DB_NAME, REGION)).toUri()); - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, REGION)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION)); - assertEquals(table.getName(), table1.getName()); - assertEquals(table.getPath(), table1.getPath()); - assertEquals(table.getSchema().size(), table1.getSchema().size()); - for (int i = 0; i < table.getSchema().size(); i++) { - assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); - } - - assertEquals(StorageConstants.DEFAULT_BINARY_SERDE, - table1.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE)); - store.dropTable(DB_NAME, REGION); - } - - @Test - public void testTableUsingSequenceFileWithTextSerde() throws Exception { - KeyValueSet options = new 
KeyValueSet(); - options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.SEQUENCEFILE, options); - - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("r_regionkey", TajoDataTypes.Type.INT4); - schema.addColumn("r_name", TajoDataTypes.Type.TEXT); - schema.addColumn("r_comment", TajoDataTypes.Type.TEXT); - - TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, REGION), schema, meta, - new Path(warehousePath, new Path(DB_NAME, REGION)).toUri()); - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, REGION)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION)); - assertEquals(table.getName(), table1.getName()); - assertEquals(table.getPath(), table1.getPath()); - assertEquals(table.getSchema().size(), table1.getSchema().size()); - for (int i = 0; i < table.getSchema().size(); i++) { - assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); - } - - assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, table1.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE)); - store.dropTable(DB_NAME, REGION); - } - - - @Test - public void testTableUsingParquet() throws Exception { - TableMeta meta = new TableMeta(CatalogProtos.StoreType.PARQUET, new KeyValueSet()); - - org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); - schema.addColumn("c_custkey", TajoDataTypes.Type.INT4); - schema.addColumn("c_name", TajoDataTypes.Type.TEXT); - schema.addColumn("c_address", TajoDataTypes.Type.TEXT); - schema.addColumn("c_nationkey", TajoDataTypes.Type.INT4); - schema.addColumn("c_phone", TajoDataTypes.Type.TEXT); - schema.addColumn("c_acctbal", TajoDataTypes.Type.FLOAT8); - schema.addColumn("c_mktsegment", TajoDataTypes.Type.TEXT); - schema.addColumn("c_comment", TajoDataTypes.Type.TEXT); - - TableDesc table = new 
TableDesc(CatalogUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta, - new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri()); - store.createTable(table.getProto()); - assertTrue(store.existTable(DB_NAME, CUSTOMER)); - - TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER)); - assertEquals(table.getName(), table1.getName()); - assertEquals(table.getPath(), table1.getPath()); - assertEquals(table.getSchema().size(), table1.getSchema().size()); - for (int i = 0; i < table.getSchema().size(); i++) { - assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); - } - - store.dropTable(DB_NAME, CUSTOMER); - } -} diff --git a/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/commands/DescFunctionCommand.java b/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/commands/DescFunctionCommand.java index 295d326d13..abdbb9c6a4 100644 --- a/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/commands/DescFunctionCommand.java +++ b/tajo-cli/src/main/java/org/apache/tajo/cli/tsql/commands/DescFunctionCommand.java @@ -52,17 +52,7 @@ public void invoke(String[] cmd) throws Exception { List functions = new ArrayList(client.getFunctions(functionName)); - Collections.sort(functions, new Comparator() { - @Override - public int compare(CatalogProtos.FunctionDescProto f1, CatalogProtos.FunctionDescProto f2) { - int nameCompared = f1.getSignature().getName().compareTo(f2.getSignature().getName()); - if (nameCompared != 0) { - return nameCompared; - } else { - return f1.getSignature().getReturnType().getType().compareTo(f2.getSignature().getReturnType().getType()); - } - } - }); + Collections.sort(functions, new FunctionUtil.FunctionDescProtoComparator()); String[] headers = new String[]{"Name", "Result type", "Argument types", "Description", "Type"}; float[] columnWidthRates = new float[]{0.15f, 0.15f, 0.2f, 0.4f, 0.1f}; @@ -70,12 +60,12 @@ public int compare(CatalogProtos.FunctionDescProto f1, CatalogProtos.FunctionDes 
for(CatalogProtos.FunctionDescProto eachFunction: functions) { String name = eachFunction.getSignature().getName(); - String resultDataType = eachFunction.getSignature().getReturnType().getType().toString(); + String resultDataType = eachFunction.getSignature().getReturnType().getType().toString().toLowerCase(); String arguments = FunctionUtil.buildParamTypeString( eachFunction.getSignature().getParameterTypesList().toArray( new DataType[eachFunction.getSignature().getParameterTypesCount()])); - String functionType = eachFunction.getSignature().getType().toString(); - String description = eachFunction.getSupplement().getShortDescription(); + String functionType = eachFunction.getSignature().getType().toString().toLowerCase(); + String description = eachFunction.getSupplement().getShortDescription().trim(); int index = 0; printLeft(" " + name, columnWidths[index++]); diff --git a/tajo-core/src/main/java/org/apache/tajo/util/JSPUtil.java b/tajo-core/src/main/java/org/apache/tajo/util/JSPUtil.java index 875d12bab7..aee2ced973 100644 --- a/tajo-core/src/main/java/org/apache/tajo/util/JSPUtil.java +++ b/tajo-core/src/main/java/org/apache/tajo/util/JSPUtil.java @@ -311,20 +311,6 @@ static int compareLong(long a, long b) { } } - public static void sortFunctionDesc(List functions) { - Collections.sort(functions, new java.util.Comparator() { - @Override - public int compare(FunctionDesc f1, FunctionDesc f2) { - int nameCompared = f1.getFunctionName().compareTo(f2.getFunctionName()); - if(nameCompared != 0) { - return nameCompared; - } else { - return f1.getReturnType().getType().compareTo(f2.getReturnType().getType()); - } - } - }); - } - static final DecimalFormat PERCENT_FORMAT = new DecimalFormat("###.#"); public static String percentFormat(float value) { return PERCENT_FORMAT.format(value * 100.0f); diff --git a/tajo-core/src/main/resources/webapps/admin/functions.jsp b/tajo-core/src/main/resources/webapps/admin/functions.jsp index c805aaa1b4..cf3ddc5a9f 100644 --- 
a/tajo-core/src/main/resources/webapps/admin/functions.jsp +++ b/tajo-core/src/main/resources/webapps/admin/functions.jsp @@ -24,13 +24,12 @@ <%@ page import="org.apache.tajo.master.*" %> <%@ page import="org.apache.tajo.catalog.*" %> <%@ page import="org.apache.hadoop.http.HtmlQuoting" %> -<%@ page import="org.apache.tajo.util.JSPUtil" %> <% TajoMaster master = (TajoMaster) StaticHttpServer.getInstance().getAttribute("tajo.info.server.object"); CatalogService catalog = master.getCatalog(); List functions = new ArrayList(catalog.getFunctions()); - JSPUtil.sortFunctionDesc(functions); + Collections.sort(functions); %> From 74d8d80242bf5e57e8f25605e5a0e91ba5a78915 Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Wed, 13 May 2015 00:07:45 +0900 Subject: [PATCH 032/141] TAJO-1598: TableMeta should change equals mechanism. Signed-off-by: Jihoon Son --- CHANGES | 3 +++ .../org/apache/tajo/catalog/TableMeta.java | 6 +++-- .../apache/tajo/catalog/TestTableMeta.java | 27 +++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/CHANGES b/CHANGES index 05ea1f2758..c64309a303 100644 --- a/CHANGES +++ b/CHANGES @@ -39,6 +39,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1598: TableMeta should change equals mechanism. + (Contributed by DaeMyung Kang, Committed by jihoon) + TAJO-1556: "insert into select" with reordered column list does not work. 
(Contributed by Yongjin Choi, Committed by jihoon) diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TableMeta.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TableMeta.java index 2d95e6be09..e14c1f5480 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TableMeta.java +++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/TableMeta.java @@ -136,8 +136,10 @@ public Map toMap() { public boolean equals(Object object) { if(object instanceof TableMeta) { TableMeta other = (TableMeta) object; - - return this.getProto().equals(other.getProto()); + + boolean eq = this.getStoreType().equals(other.getStoreType()); + eq = eq && this.getOptions().equals(other.getOptions()); + return eq; } return false; diff --git a/tajo-catalog/tajo-catalog-common/src/test/java/org/apache/tajo/catalog/TestTableMeta.java b/tajo-catalog/tajo-catalog-common/src/test/java/org/apache/tajo/catalog/TestTableMeta.java index 904b4bc6ee..3d6b39094e 100644 --- a/tajo-catalog/tajo-catalog-common/src/test/java/org/apache/tajo/catalog/TestTableMeta.java +++ b/tajo-catalog/tajo-catalog-common/src/test/java/org/apache/tajo/catalog/TestTableMeta.java @@ -22,6 +22,7 @@ import org.apache.tajo.catalog.proto.CatalogProtos.StoreType; import org.apache.tajo.catalog.proto.CatalogProtos.TableProto; import org.apache.tajo.common.TajoDataTypes.Type; +import org.apache.tajo.rpc.protocolrecords.PrimitiveProtos; import org.junit.Before; import org.junit.Test; @@ -86,6 +87,32 @@ public void testEqualsObject() { assertTrue(meta.equals(meta2)); assertNotSame(meta, meta2); } + + @Test + public void testEqualsObject2() { + //This testcases should insert more 2 items into one slot. 
+ //HashMap's default slot count is 16 + //so max_count is 17 + + int MAX_COUNT = 17; + + TableMeta meta1 = CatalogUtil.newTableMeta(StoreType.CSV.toString()); + for (int i = 0; i < MAX_COUNT; i++) { + meta1.putOption("key"+i, "value"+i); + } + + PrimitiveProtos.KeyValueSetProto.Builder optionBuilder = PrimitiveProtos.KeyValueSetProto.newBuilder(); + for (int i = 1; i <= MAX_COUNT; i++) { + PrimitiveProtos.KeyValueProto.Builder keyValueBuilder = PrimitiveProtos.KeyValueProto.newBuilder(); + keyValueBuilder.setKey("key"+(MAX_COUNT-i)).setValue("value"+(MAX_COUNT-i)); + optionBuilder.addKeyval(keyValueBuilder); + } + TableProto.Builder builder = TableProto.newBuilder(); + builder.setStoreType(StoreType.CSV.toString()); + builder.setParams(optionBuilder); + TableMeta meta2 = new TableMeta(builder.build()); + assertTrue(meta1.equals(meta2)); + } @Test public void testGetProto() { From effc35398b8441121b3c5372e792634e532f34a2 Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Thu, 14 May 2015 16:44:16 +0900 Subject: [PATCH 033/141] TAJO-1485: Datum 'Char' returned only 1byte. 
Closes #503 Signed-off-by: Jihoon Son --- CHANGES | 3 ++ .../org/apache/tajo/storage/RowStoreUtil.java | 24 ++++++++-- ...alueTooLongForTypeCharactersException.java | 27 +++++++++++ .../tajo/engine/query/TestInsertQuery.java | 45 ++++++++++++++++- .../tajo/engine/util/TestTupleUtil.java | 18 +++++++ .../queries/TestInsertQuery/test1_ddl.sql | 1 + .../TestInsertQuery/test1_nolength_ddl.sql | 1 + .../testInsertIntoSelectWithFixedSizeChar.sql | 4 ++ ...ntoSelectWithFixedSizeCharWithNoLength.sql | 2 + .../org/apache/tajo/plan/LogicalPlanner.java | 4 ++ .../storage/BinarySerializerDeserializer.java | 10 ++++ .../org/apache/tajo/storage/RowStoreUtil.java | 20 ++++++-- .../storage/TextSerializerDeserializer.java | 10 ++-- .../storage/parquet/TajoWriteSupport.java | 7 +++ .../text/TextFieldSerializerDeserializer.java | 8 +++- .../org/apache/tajo/storage/TestStorages.java | 48 ++++++++++++++++++- 16 files changed, 218 insertions(+), 14 deletions(-) create mode 100644 tajo-common/src/main/java/org/apache/tajo/exception/ValueTooLongForTypeCharactersException.java create mode 100644 tajo-core/src/test/resources/queries/TestInsertQuery/test1_ddl.sql create mode 100644 tajo-core/src/test/resources/queries/TestInsertQuery/test1_nolength_ddl.sql create mode 100644 tajo-core/src/test/resources/queries/TestInsertQuery/testInsertIntoSelectWithFixedSizeChar.sql create mode 100644 tajo-core/src/test/resources/queries/TestInsertQuery/testInsertIntoSelectWithFixedSizeCharWithNoLength.sql diff --git a/CHANGES b/CHANGES index c64309a303..4f718a963a 100644 --- a/CHANGES +++ b/CHANGES @@ -39,6 +39,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1485: Datum 'Char' returned only 1byte. + (Contributed by DaeMyung Kang, Committed by jihoon) + TAJO-1598: TableMeta should change equals mechanism. 
(Contributed by DaeMyung Kang, Committed by jihoon) diff --git a/tajo-client/src/main/java/org/apache/tajo/storage/RowStoreUtil.java b/tajo-client/src/main/java/org/apache/tajo/storage/RowStoreUtil.java index 385f99cb5a..eef1c2697f 100644 --- a/tajo-client/src/main/java/org/apache/tajo/storage/RowStoreUtil.java +++ b/tajo-client/src/main/java/org/apache/tajo/storage/RowStoreUtil.java @@ -25,6 +25,7 @@ import org.apache.tajo.datum.IntervalDatum; import org.apache.tajo.exception.UnknownDataTypeException; import org.apache.tajo.exception.UnsupportedException; +import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.util.BitArray; import java.nio.ByteBuffer; @@ -100,8 +101,9 @@ public Tuple toTuple(byte [] bytes) { break; case CHAR: - byte c = bb.get(); - tuple.put(i, DatumFactory.createChar(c)); + byte [] _str = new byte[type.getLength()]; + bb.get(_str); + tuple.put(i, DatumFactory.createChar(_str)); break; case INT2: @@ -197,7 +199,17 @@ private RowStoreEncoder(Schema schema) { case NULL_TYPE: nullFlags.set(i); break; case BOOLEAN: bb.put(tuple.get(i).asByte()); break; case BIT: bb.put(tuple.get(i).asByte()); break; - case CHAR: bb.put(tuple.get(i).asByte()); break; + case CHAR: + int charSize = col.getDataType().getLength(); + byte [] _char = new byte[charSize]; + byte [] src = tuple.get(i).asByteArray(); + if (charSize < src.length) { + throw new ValueTooLongForTypeCharactersException(charSize); + } + + System.arraycopy(src, 0, _char, 0, src.length); + bb.put(_char); + break; case INT2: bb.putShort(tuple.get(i).asInt2()); break; case INT4: bb.putInt(tuple.get(i).asInt4()); break; case INT8: bb.putLong(tuple.get(i).asInt8()); break; @@ -259,7 +271,11 @@ private int estimateTupleDataSize(Tuple tuple) { switch (col.getDataType().getType()) { case BOOLEAN: case BIT: - case CHAR: size += 1; break; + size += 1; + break; + case CHAR: + size += col.getDataType().getLength(); + break; case INT2: size += 2; break; case DATE: case 
INT4: diff --git a/tajo-common/src/main/java/org/apache/tajo/exception/ValueTooLongForTypeCharactersException.java b/tajo-common/src/main/java/org/apache/tajo/exception/ValueTooLongForTypeCharactersException.java new file mode 100644 index 0000000000..262b71402f --- /dev/null +++ b/tajo-common/src/main/java/org/apache/tajo/exception/ValueTooLongForTypeCharactersException.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.exception; + +public class ValueTooLongForTypeCharactersException extends RuntimeException { + private static final long serialVersionUID = -7689027447969916150L; + + public ValueTooLongForTypeCharactersException(int size) { + super("value too long for type character(" + size + ")"); + } +} diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestInsertQuery.java b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestInsertQuery.java index 72cbf871b6..4c09acde13 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/query/TestInsertQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/query/TestInsertQuery.java @@ -39,7 +39,6 @@ import java.util.List; import static org.junit.Assert.*; -import static org.junit.Assert.assertEquals; @Category(IntegrationTest.class) public class TestInsertQuery extends QueryTestCaseBase { @@ -836,4 +835,48 @@ public final void testInsertWithDifferentColumnOrder() throws Exception { executeString("drop table nation_diff purge;"); } } + + @Test + public final void testFixedCharSelectWithNoLength() throws Exception { + ResultSet res = executeFile("test1_nolength_ddl.sql"); + res.close(); + + CatalogService catalog = testingCluster.getMaster().getCatalog(); + assertTrue(catalog.existsTable(getCurrentDatabase(), "test1")); + + res = executeFile("testInsertIntoSelectWithFixedSizeCharWithNoLength.sql"); + res.close(); + + //remove \0 + String resultDatas = getTableFileContents("test1").replaceAll("\0",""); + String expected = "a\n"; + + assertNotNull(resultDatas); + assertEquals(expected.length(), resultDatas.length()); + assertEquals(expected, resultDatas); + executeString("DROP TABLE test1 PURGE"); + } + + @Test + public final void testFixedCharSelect() throws Exception { + ResultSet res = executeFile("test1_ddl.sql"); + res.close(); + + CatalogService catalog = testingCluster.getMaster().getCatalog(); + assertTrue(catalog.existsTable(getCurrentDatabase(), "test1")); + + res = 
executeFile("testInsertIntoSelectWithFixedSizeChar.sql"); + res.close(); + + //remove \0 + String resultDatas = getTableFileContents("test1").replaceAll("\0",""); + String expected = "a\n" + + "abc\n" + + "abcde\n"; + + assertNotNull(resultDatas); + assertEquals(expected.length(), resultDatas.length()); + assertEquals(expected, resultDatas); + executeString("DROP TABLE test1 PURGE"); + } } diff --git a/tajo-core/src/test/java/org/apache/tajo/engine/util/TestTupleUtil.java b/tajo-core/src/test/java/org/apache/tajo/engine/util/TestTupleUtil.java index b8114e0550..c1c07b8251 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/util/TestTupleUtil.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/util/TestTupleUtil.java @@ -36,6 +36,24 @@ import static org.junit.Assert.*; public class TestTupleUtil { + @Test + public final void testFixedSizeChar() { + Schema schema = new Schema(); + schema.addColumn("col1", Type.CHAR, 5); + + Tuple tuple = new VTuple(1); + tuple.put(new Datum[] { + DatumFactory.createChar("abc\0\0") + }); + + RowStoreEncoder encoder = RowStoreUtil.createEncoder(schema); + RowStoreDecoder decoder = RowStoreUtil.createDecoder(schema); + byte [] bytes = encoder.toBytes(tuple); + Tuple tuple2 = decoder.toTuple(bytes); + + assertEquals(tuple, tuple2); + } + @Test public final void testToBytesAndToTuple() { Schema schema = new Schema(); diff --git a/tajo-core/src/test/resources/queries/TestInsertQuery/test1_ddl.sql b/tajo-core/src/test/resources/queries/TestInsertQuery/test1_ddl.sql new file mode 100644 index 0000000000..c02b080df6 --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestInsertQuery/test1_ddl.sql @@ -0,0 +1 @@ +create table test1 (col1 char(5)); \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestInsertQuery/test1_nolength_ddl.sql b/tajo-core/src/test/resources/queries/TestInsertQuery/test1_nolength_ddl.sql new file mode 100644 index 0000000000..cbe3654d64 --- /dev/null +++ 
b/tajo-core/src/test/resources/queries/TestInsertQuery/test1_nolength_ddl.sql @@ -0,0 +1 @@ +create table test1 (col1 char); \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertIntoSelectWithFixedSizeChar.sql b/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertIntoSelectWithFixedSizeChar.sql new file mode 100644 index 0000000000..f7ec11cfbd --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertIntoSelectWithFixedSizeChar.sql @@ -0,0 +1,4 @@ +insert into test1 select 'a'; +insert into test1 select 'abc'; +insert into test1 select 'abcde'; +select * from test1; \ No newline at end of file diff --git a/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertIntoSelectWithFixedSizeCharWithNoLength.sql b/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertIntoSelectWithFixedSizeCharWithNoLength.sql new file mode 100644 index 0000000000..02a1d6c45d --- /dev/null +++ b/tajo-core/src/test/resources/queries/TestInsertQuery/testInsertIntoSelectWithFixedSizeCharWithNoLength.sql @@ -0,0 +1,2 @@ +insert into test1 select 'a'; +select * from test1; \ No newline at end of file diff --git a/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java b/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java index a2621009b9..27cd3b0bd1 100644 --- a/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java +++ b/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java @@ -1882,6 +1882,10 @@ public static TajoDataTypes.DataType convertDataType(DataTypeExpr dataType) { builder.setType(type); if (dataType.hasLengthOrPrecision()) { builder.setLength(dataType.getLengthOrPrecision()); + } else { + if (type == TajoDataTypes.Type.CHAR) { + builder.setLength(1); + } } return builder.build(); } diff --git a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BinarySerializerDeserializer.java 
b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BinarySerializerDeserializer.java index 00112e7659..a3b8da8a2c 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BinarySerializerDeserializer.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BinarySerializerDeserializer.java @@ -22,6 +22,7 @@ import com.google.protobuf.Message; import org.apache.tajo.catalog.Column; import org.apache.tajo.datum.*; +import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.util.Bytes; import java.io.IOException; @@ -44,9 +45,18 @@ public int serialize(Column col, Datum datum, OutputStream out, byte[] nullChara switch (col.getDataType().getType()) { case BOOLEAN: case BIT: + bytes = datum.asByteArray(); + length = bytes.length; + out.write(bytes, 0, length); + break; + case CHAR: bytes = datum.asByteArray(); length = bytes.length; + if (length > col.getDataType().getLength()) { + throw new ValueTooLongForTypeCharactersException(col.getDataType().getLength()); + } + out.write(bytes, 0, length); break; case INT2: diff --git a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/RowStoreUtil.java b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/RowStoreUtil.java index 33db7982ac..ad43f7b6ca 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/RowStoreUtil.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/RowStoreUtil.java @@ -26,6 +26,7 @@ import org.apache.tajo.datum.ProtobufDatum; import org.apache.tajo.exception.UnknownDataTypeException; import org.apache.tajo.exception.UnsupportedException; +import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.tuple.offheap.RowWriter; import org.apache.tajo.util.BitArray; @@ -99,8 +100,9 @@ public Tuple toTuple(byte [] bytes) { break; case CHAR: - byte c = bb.get(); - 
tuple.put(i, DatumFactory.createChar(c)); + byte [] _str = new byte[type.getLength()]; + bb.get(_str); + tuple.put(i, DatumFactory.createChar(_str)); break; case INT2: @@ -204,7 +206,15 @@ public byte[] toBytes(Tuple tuple) { bb.put(tuple.get(i).asByte()); break; case CHAR: - bb.put(tuple.get(i).asByte()); + int charSize = col.getDataType().getLength(); + byte [] _char = new byte[charSize]; + byte [] src = tuple.get(i).asByteArray(); + if (charSize < src.length) { + throw new ValueTooLongForTypeCharactersException(charSize); + } + + System.arraycopy(src, 0, _char, 0, src.length); + bb.put(_char); break; case INT2: bb.putShort(tuple.get(i).asInt2()); @@ -281,9 +291,11 @@ private int estimateTupleDataSize(Tuple tuple) { switch (col.getDataType().getType()) { case BOOLEAN: case BIT: - case CHAR: size += 1; break; + case CHAR: + size += col.getDataType().getLength(); + break; case INT2: size += 2; break; diff --git a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java index ab8816bc28..954b62d873 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/TextSerializerDeserializer.java @@ -20,12 +20,11 @@ import com.google.protobuf.Message; import org.apache.commons.codec.binary.Base64; -import org.apache.tajo.TajoConstants; import org.apache.tajo.catalog.Column; import org.apache.tajo.common.TajoDataTypes; -import org.apache.tajo.conf.TajoConf; import org.apache.tajo.datum.*; import org.apache.tajo.datum.protobuf.ProtobufJsonFormat; +import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.NumberUtil; @@ -66,7 +65,12 @@ public int serialize(Column col, Datum datum, OutputStream out, byte[] nullChara length 
= trueBytes.length; break; case CHAR: - byte[] pad = new byte[dataType.getLength() - datum.size()]; + int size = dataType.getLength() - datum.size(); + if (size < 0){ + throw new ValueTooLongForTypeCharactersException(dataType.getLength()); + } + + byte[] pad = new byte[size]; bytes = datum.asTextBytes(); out.write(bytes); out.write(pad); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/parquet/TajoWriteSupport.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/parquet/TajoWriteSupport.java index e05aeafdda..dd951e1ffc 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/parquet/TajoWriteSupport.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/parquet/TajoWriteSupport.java @@ -23,6 +23,7 @@ import org.apache.tajo.catalog.Schema; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.datum.Datum; +import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.storage.Tuple; import parquet.hadoop.api.WriteSupport; import parquet.io.api.Binary; @@ -132,6 +133,12 @@ private void writeValue(Type fieldType, Column column, Datum datum) { recordConsumer.addDouble(datum.asFloat8()); break; case CHAR: + if (datum.size() > column.getDataType().getLength()) { + throw new ValueTooLongForTypeCharactersException(column.getDataType().getLength()); + } + + recordConsumer.addBinary(Binary.fromByteArray(datum.asTextBytes())); + break; case TEXT: recordConsumer.addBinary(Binary.fromByteArray(datum.asTextBytes())); break; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java index e637c7f0cb..d2eee9f538 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java @@ -28,6 +28,7 @@ import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.datum.*; import org.apache.tajo.datum.protobuf.ProtobufJsonFormat; +import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.storage.FieldSerializerDeserializer; import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.util.Bytes; @@ -86,7 +87,12 @@ public int serialize(OutputStream out, Datum datum, Column col, int columnIndex, length = trueBytes.length; break; case CHAR: - byte[] pad = new byte[dataType.getLength() - datum.size()]; + int size = dataType.getLength() - datum.size(); + if (size < 0){ + throw new ValueTooLongForTypeCharactersException(dataType.getLength()); + } + + byte[] pad = new byte[size]; bytes = datum.asTextBytes(); out.write(bytes); out.write(pad); diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java index b73fb5baf8..6a7cd94bf1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java @@ -30,7 +30,6 @@ import org.apache.tajo.catalog.CatalogUtil; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; -import org.apache.tajo.catalog.proto.CatalogProtos.StoreType; import org.apache.tajo.catalog.statistics.TableStats; import org.apache.tajo.common.TajoDataTypes.Type; import org.apache.tajo.conf.TajoConf; @@ -38,6 +37,7 @@ import org.apache.tajo.datum.DatumFactory; import org.apache.tajo.datum.NullDatum; import org.apache.tajo.datum.ProtobufDatumFactory; +import org.apache.tajo.exception.ValueTooLongForTypeCharactersException; import org.apache.tajo.storage.fragment.FileFragment; import 
org.apache.tajo.storage.rcfile.RCFile; import org.apache.tajo.storage.sequencefile.SequenceFileScanner; @@ -1011,4 +1011,50 @@ public void testLessThanSchemaSize() throws IOException { assertEquals(expect.get(1), tuple.get(1)); assertEquals(NullDatum.get(), tuple.get(4)); } + + @Test + public final void testInsertFixedCharTypeWithOverSize() throws Exception { + if (storeType.equalsIgnoreCase("CSV") == false && + storeType.equalsIgnoreCase("SEQUENCEFILE") == false && + storeType.equalsIgnoreCase("RCFILE") == false && + storeType.equalsIgnoreCase("PARQUET") == false) { + return; + } + + Schema dataSchema = new Schema(); + dataSchema.addColumn("col1", Type.CHAR); + + KeyValueSet options = new KeyValueSet(); + TableMeta meta = CatalogUtil.newTableMeta(storeType, options); + meta.setOptions(CatalogUtil.newPhysicalProperties(storeType)); + + Path tablePath = new Path(testDir, "test_storetype_oversize.data"); + FileStorageManager sm = (FileStorageManager) StorageManager.getFileStorageManager(conf); + Appender appender = sm.getAppender(meta, dataSchema, tablePath); + appender.init(); + + Tuple expect = new VTuple(dataSchema.size()); + expect.put(new Datum[]{ + DatumFactory.createChar("1"), + }); + + appender.addTuple(expect); + appender.flush(); + + Tuple expect2 = new VTuple(dataSchema.size()); + expect2.put(new Datum[]{ + DatumFactory.createChar("12"), + }); + + boolean ok = false; + try { + appender.addTuple(expect2); + appender.flush(); + appender.close(); + } catch (ValueTooLongForTypeCharactersException e) { + ok = true; + } + + assertTrue(ok); + } } From 568386dc890ad829314e6c23b0f782e51b821a50 Mon Sep 17 00:00:00 2001 From: DaeMyung Kang Date: Thu, 14 May 2015 17:12:42 +0900 Subject: [PATCH 034/141] TAJO-1485: Datum 'Char' returned only 1byte. 
(missing changes) Signed-off-by: Jihoon Son --- .../test/java/org/apache/tajo/catalog/TestTableMeta.java | 4 ++-- .../test/java/org/apache/tajo/storage/TestStorages.java | 9 +++++---- .../org/apache/tajo/storage/parquet/TestReadWrite.java | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/tajo-catalog/tajo-catalog-common/src/test/java/org/apache/tajo/catalog/TestTableMeta.java b/tajo-catalog/tajo-catalog-common/src/test/java/org/apache/tajo/catalog/TestTableMeta.java index 3d6b39094e..9006fb2eab 100644 --- a/tajo-catalog/tajo-catalog-common/src/test/java/org/apache/tajo/catalog/TestTableMeta.java +++ b/tajo-catalog/tajo-catalog-common/src/test/java/org/apache/tajo/catalog/TestTableMeta.java @@ -96,7 +96,7 @@ public void testEqualsObject2() { int MAX_COUNT = 17; - TableMeta meta1 = CatalogUtil.newTableMeta(StoreType.CSV.toString()); + TableMeta meta1 = CatalogUtil.newTableMeta(StoreType.CSV); for (int i = 0; i < MAX_COUNT; i++) { meta1.putOption("key"+i, "value"+i); } @@ -108,7 +108,7 @@ public void testEqualsObject2() { optionBuilder.addKeyval(keyValueBuilder); } TableProto.Builder builder = TableProto.newBuilder(); - builder.setStoreType(StoreType.CSV.toString()); + builder.setStoreType(StoreType.CSV); builder.setParams(optionBuilder); TableMeta meta2 = new TableMeta(builder.build()); assertTrue(meta1.equals(meta2)); diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java index 6a7cd94bf1..4d4e909701 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java @@ -30,6 +30,7 @@ import org.apache.tajo.catalog.CatalogUtil; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.catalog.proto.CatalogProtos.StoreType; import 
org.apache.tajo.catalog.statistics.TableStats; import org.apache.tajo.common.TajoDataTypes.Type; import org.apache.tajo.conf.TajoConf; @@ -1014,10 +1015,10 @@ public void testLessThanSchemaSize() throws IOException { @Test public final void testInsertFixedCharTypeWithOverSize() throws Exception { - if (storeType.equalsIgnoreCase("CSV") == false && - storeType.equalsIgnoreCase("SEQUENCEFILE") == false && - storeType.equalsIgnoreCase("RCFILE") == false && - storeType.equalsIgnoreCase("PARQUET") == false) { + if (!storeType.equals(StoreType.CSV) && + !storeType.equals(StoreType.SEQUENCEFILE) && + !storeType.equals(StoreType.RCFILE) && + !storeType.equals(StoreType.PARQUET)) { return; } diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/parquet/TestReadWrite.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/parquet/TestReadWrite.java index 109fed9a72..af0159e03b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/parquet/TestReadWrite.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/parquet/TestReadWrite.java @@ -55,7 +55,7 @@ private Schema createAllTypesSchema() { List columns = new ArrayList(); columns.add(new Column("myboolean", Type.BOOLEAN)); columns.add(new Column("mybit", Type.BIT)); - columns.add(new Column("mychar", Type.CHAR)); + columns.add(new Column("mychar", Type.CHAR, 1)); columns.add(new Column("myint2", Type.INT2)); columns.add(new Column("myint4", Type.INT4)); columns.add(new Column("myint8", Type.INT8)); From 19554d80154735aaf0823884eb6d28061272d06a Mon Sep 17 00:00:00 2001 From: JaeHwa Jung Date: Thu, 14 May 2015 18:49:42 +0900 Subject: [PATCH 035/141] TAJO-1586: TajoMaster HA startup failure on Yarn. 
(jaehwa) --- CHANGES | 2 + .../org/apache/tajo/cli/tools/TajoAdmin.java | 6 +- .../apache/tajo/cli/tools/TajoHAAdmin.java | 13 +- .../tajo/client/DummyServiceTracker.java | 19 +- .../java/org/apache/tajo/conf/TajoConf.java | 2 + .../java/org/apache/tajo/ha/HAConstants.java | 1 + .../org/apache/tajo/ha/HAServiceUtil.java | 253 -------------- .../tajo/service/BaseServiceTracker.java | 31 +- .../apache/tajo/service/HAServiceTracker.java | 20 +- .../apache/tajo/service/ServiceTracker.java | 28 +- .../java/org/apache/tajo/util/FileUtil.java | 22 ++ .../apache/tajo/ha/HdfsServiceTracker.java | 322 ++++++++++-------- .../org/apache/tajo/master/TajoMaster.java | 21 +- .../java/org/apache/tajo/util/JSPUtil.java | 2 +- .../tajo/worker/TajoResourceAllocator.java | 1 - .../org/apache/tajo/worker/TajoWorker.java | 2 + .../resources/webapps/admin/catalogview.jsp | 10 +- .../main/resources/webapps/admin/cluster.jsp | 10 +- .../main/resources/webapps/admin/index.jsp | 10 +- .../main/resources/webapps/admin/query.jsp | 7 +- .../webapps/admin/query_executor.jsp | 9 +- .../apache/tajo/ha/TestHAServiceHDFSImpl.java | 26 +- 22 files changed, 363 insertions(+), 454 deletions(-) delete mode 100644 tajo-common/src/main/java/org/apache/tajo/ha/HAServiceUtil.java diff --git a/CHANGES b/CHANGES index 4f718a963a..39e2dfec9a 100644 --- a/CHANGES +++ b/CHANGES @@ -39,6 +39,8 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1586: TajoMaster HA startup failure on Yarn. (jaehwa) + TAJO-1485: Datum 'Char' returned only 1byte. 
(Contributed by DaeMyung Kang, Committed by jihoon) diff --git a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoAdmin.java b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoAdmin.java index 549743512d..6738489e77 100644 --- a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoAdmin.java +++ b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoAdmin.java @@ -30,7 +30,6 @@ import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.service.ServiceTrackerFactory; import org.apache.tajo.util.NetUtils; -import org.apache.tajo.ha.HAServiceUtil; import org.apache.tajo.util.TajoIdUtils; import java.io.IOException; @@ -71,8 +70,8 @@ private enum WorkerStatus { private TajoConf tajoConf; private TajoClient tajoClient; - private ServiceTracker serviceTracker; private Writer writer; + private ServiceTracker serviceTracker; public TajoAdmin(TajoConf tajoConf, Writer writer) { this(tajoConf, writer, null); @@ -82,6 +81,7 @@ public TajoAdmin(TajoConf tajoConf, Writer writer, TajoClient tajoClient) { this.tajoConf = tajoConf; this.writer = writer; this.tajoClient = tajoClient; + serviceTracker = ServiceTrackerFactory.get(this.tajoConf); } private void printUsage() { @@ -427,7 +427,7 @@ private void processMasters(Writer writer) throws ParseException, IOException, if (tajoConf.getBoolVar(TajoConf.ConfVars.TAJO_MASTER_HA_ENABLE)) { - List list = HAServiceUtil.getMasters(tajoConf); + List list = serviceTracker.getMasters(tajoConf); int i = 0; for (String master : list) { if (i > 0) { diff --git a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoHAAdmin.java b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoHAAdmin.java index 127ee8c792..84fab33789 100644 --- a/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoHAAdmin.java +++ b/tajo-cli/src/main/java/org/apache/tajo/cli/tools/TajoHAAdmin.java @@ -21,9 +21,8 @@ import com.google.protobuf.ServiceException; import org.apache.commons.cli.*; import org.apache.tajo.client.TajoClient; -import 
org.apache.tajo.client.TajoClientImpl; import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.ha.HAServiceUtil; +import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.service.ServiceTrackerFactory; import java.io.IOException; @@ -44,8 +43,8 @@ public class TajoHAAdmin { } private TajoConf tajoConf; - private TajoClient tajoClient; private Writer writer; + private ServiceTracker serviceTracker; public TajoHAAdmin(TajoConf tajoConf, Writer writer) { this(tajoConf, writer, null); @@ -54,7 +53,6 @@ public TajoHAAdmin(TajoConf tajoConf, Writer writer) { public TajoHAAdmin(TajoConf tajoConf, Writer writer, TajoClient tajoClient) { this.tajoConf = tajoConf; this.writer = writer; - this.tajoClient = tajoClient; } private void printUsage() { @@ -127,9 +125,6 @@ public void runCommand(String[] args) throws Exception { return; } else if (hostName != null && port != null) { tajoConf.setVar(TajoConf.ConfVars.TAJO_MASTER_CLIENT_RPC_ADDRESS, hostName + ":" + port); - tajoClient = new TajoClientImpl(ServiceTrackerFactory.get(tajoConf)); - } else if (hostName == null && port == null) { - tajoClient = new TajoClientImpl(ServiceTrackerFactory.get(tajoConf)); } if (!tajoConf.getBoolVar(TajoConf.ConfVars.TAJO_MASTER_HA_ENABLE)) { @@ -160,7 +155,7 @@ public void runCommand(String[] args) throws Exception { private void getState(Writer writer, String param) throws ParseException, IOException, ServiceException { - int retValue = HAServiceUtil.getState(param, tajoConf); + int retValue = serviceTracker.getState(param, tajoConf); switch (retValue) { case 1: @@ -180,7 +175,7 @@ private void getState(Writer writer, String param) throws ParseException, IOExce private void formatHA(Writer writer) throws ParseException, IOException, ServiceException { - int retValue = HAServiceUtil.formatHA(tajoConf); + int retValue = serviceTracker.formatHA(tajoConf); switch (retValue) { case 1: diff --git a/tajo-client/src/main/java/org/apache/tajo/client/DummyServiceTracker.java 
b/tajo-client/src/main/java/org/apache/tajo/client/DummyServiceTracker.java index 762c2e730f..cf826ea285 100644 --- a/tajo-client/src/main/java/org/apache/tajo/client/DummyServiceTracker.java +++ b/tajo-client/src/main/java/org/apache/tajo/client/DummyServiceTracker.java @@ -18,6 +18,7 @@ package org.apache.tajo.client; +import org.apache.tajo.conf.TajoConf; import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.service.ServiceTracker; import org.apache.tajo.service.ServiceTrackerException; @@ -25,6 +26,7 @@ import java.io.IOException; import java.net.InetSocketAddress; +import java.util.ArrayList; import java.util.List; public class DummyServiceTracker implements ServiceTracker { @@ -64,6 +66,21 @@ public InetSocketAddress getMasterHttpInfo() throws ServiceTrackerException { throw new UnsupportedException(); } + @Override + public int getState(String masterName, TajoConf conf) throws ServiceTrackerException { + return 0; + } + + @Override + public int formatHA(TajoConf conf) throws ServiceTrackerException { + return 0; + } + + @Override + public List getMasters(TajoConf conf) throws ServiceTrackerException { + return new ArrayList(); + } + @Override public void register() throws IOException { } @@ -73,7 +90,7 @@ public void delete() throws IOException { } @Override - public boolean isActiveStatus() { + public boolean isActiveMaster() { return true; } diff --git a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java index 4ed8097b4f..1cc1240a2f 100644 --- a/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java +++ b/tajo-common/src/main/java/org/apache/tajo/conf/TajoConf.java @@ -138,6 +138,8 @@ public static enum ConfVars implements ConfigKey { // High availability configurations TAJO_MASTER_HA_ENABLE("tajo.master.ha.enable", false, Validators.bool()), TAJO_MASTER_HA_MONITOR_INTERVAL("tajo.master.ha.monitor.interval", 5 * 1000), // 5 sec + 
TAJO_MASTER_HA_CLIENT_RETRY_MAX_NUM("tajo.master.ha.client.read.retry.max-num", 120), // 120 retry + TAJO_MASTER_HA_CLIENT_RETRY_PAUSE_TIME("tajo.master.ha.client.read.pause-time", 500), // 500 ms // Service discovery DEFAULT_SERVICE_TRACKER_CLASS("tajo.discovery.service-tracker.class", BaseServiceTracker.class.getCanonicalName()), diff --git a/tajo-common/src/main/java/org/apache/tajo/ha/HAConstants.java b/tajo-common/src/main/java/org/apache/tajo/ha/HAConstants.java index c5f4b8a749..7af19c6130 100644 --- a/tajo-common/src/main/java/org/apache/tajo/ha/HAConstants.java +++ b/tajo-common/src/main/java/org/apache/tajo/ha/HAConstants.java @@ -24,4 +24,5 @@ public class HAConstants { public final static int RESOURCE_TRACKER_RPC_ADDRESS = 3; public final static int CATALOG_ADDRESS = 4; public final static int MASTER_INFO_ADDRESS = 5; + public final static String ACTIVE_LOCK_FILE = "active.lock"; } diff --git a/tajo-common/src/main/java/org/apache/tajo/ha/HAServiceUtil.java b/tajo-common/src/main/java/org/apache/tajo/ha/HAServiceUtil.java deleted file mode 100644 index 700122801e..0000000000 --- a/tajo-common/src/main/java/org/apache/tajo/ha/HAServiceUtil.java +++ /dev/null @@ -1,253 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.ha; - -import org.apache.hadoop.fs.*; -import org.apache.tajo.TajoConstants; -import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.util.NetUtils; - - -import javax.net.SocketFactory; -import java.io.IOException; -import java.net.InetSocketAddress; -import java.net.Socket; -import java.util.ArrayList; -import java.util.List; - -public class HAServiceUtil { - - public static InetSocketAddress getMasterClientAddress(TajoConf conf) { - return getMasterAddress(conf, HAConstants.MASTER_CLIENT_RPC_ADDRESS); - } - - public static String getMasterClientName(TajoConf conf) { - return NetUtils.normalizeInetSocketAddress(getMasterClientAddress(conf)); - } - - public static InetSocketAddress getMasterAddress(TajoConf conf, int type) { - InetSocketAddress masterAddress = null; - - if (conf.getBoolVar(TajoConf.ConfVars.TAJO_MASTER_HA_ENABLE)) { - try { - FileSystem fs = getFileSystem(conf); - Path activePath = new Path(TajoConf.getSystemHADir(conf), TajoConstants.SYSTEM_HA_ACTIVE_DIR_NAME); - - if (fs.exists(activePath)) { - FileStatus[] files = fs.listStatus(activePath); - - if (files.length == 1) { - Path file = files[0].getPath(); - String hostAddress = file.getName().replaceAll("_", ":"); - FSDataInputStream stream = fs.open(file); - String data = stream.readUTF(); - stream.close(); - - String[] addresses = data.split("_"); - - switch (type) { - case HAConstants.MASTER_UMBILICAL_RPC_ADDRESS: - masterAddress = NetUtils.createSocketAddr(hostAddress); - break; - case HAConstants.MASTER_CLIENT_RPC_ADDRESS: - masterAddress = NetUtils.createSocketAddr(addresses[0]); - break; - case HAConstants.RESOURCE_TRACKER_RPC_ADDRESS: - masterAddress = NetUtils.createSocketAddr(addresses[1]); - break; - case HAConstants.CATALOG_ADDRESS: - masterAddress = NetUtils.createSocketAddr(addresses[2]); - break; - case HAConstants.MASTER_INFO_ADDRESS: - 
masterAddress = NetUtils.createSocketAddr(addresses[3]); - break; - default: - break; - } - } - } - - } catch (Exception e) { - e.printStackTrace(); - } - } - - if (masterAddress == null) { - switch (type) { - case HAConstants.MASTER_UMBILICAL_RPC_ADDRESS: - masterAddress = NetUtils.createSocketAddr(conf.getVar(TajoConf.ConfVars - .TAJO_MASTER_UMBILICAL_RPC_ADDRESS)); - break; - case HAConstants.MASTER_CLIENT_RPC_ADDRESS: - masterAddress = NetUtils.createSocketAddr(conf.getVar(TajoConf.ConfVars - .TAJO_MASTER_CLIENT_RPC_ADDRESS)); - break; - case HAConstants.RESOURCE_TRACKER_RPC_ADDRESS: - masterAddress = NetUtils.createSocketAddr(conf.getVar(TajoConf.ConfVars - .RESOURCE_TRACKER_RPC_ADDRESS)); - break; - case HAConstants.CATALOG_ADDRESS: - masterAddress = NetUtils.createSocketAddr(conf.getVar(TajoConf.ConfVars - .CATALOG_ADDRESS)); - break; - case HAConstants.MASTER_INFO_ADDRESS: - masterAddress = NetUtils.createSocketAddr(conf.getVar(TajoConf.ConfVars - .TAJO_MASTER_INFO_ADDRESS)); - break; - default: - break; - } - } - - return masterAddress; - } - - public static boolean isMasterAlive(String masterName, TajoConf conf) { - boolean isAlive = true; - - try { - // how to create sockets - SocketFactory socketFactory = org.apache.hadoop.net.NetUtils.getDefaultSocketFactory(conf); - - int connectionTimeout = conf.getInt(CommonConfigurationKeys.IPC_CLIENT_CONNECT_TIMEOUT_KEY, - CommonConfigurationKeys.IPC_CLIENT_CONNECT_TIMEOUT_DEFAULT); - - InetSocketAddress server = org.apache.hadoop.net.NetUtils.createSocketAddr(masterName); - - // connected socket - Socket socket = socketFactory.createSocket(); - org.apache.hadoop.net.NetUtils.connect(socket, server, connectionTimeout); - } catch (Exception e) { - isAlive = false; - } - return isAlive; - } - - public static int getState(String masterName, TajoConf conf) { - String targetMaster = masterName.replaceAll(":", "_"); - int retValue = -1; - - try { - FileSystem fs = getFileSystem(conf); - Path activePath = new 
Path(TajoConf.getSystemHADir(conf), TajoConstants.SYSTEM_HA_ACTIVE_DIR_NAME); - Path backupPath = new Path(TajoConf.getSystemHADir(conf), TajoConstants.SYSTEM_HA_BACKUP_DIR_NAME); - - Path temPath = null; - - // Check backup masters - FileStatus[] files = fs.listStatus(backupPath); - for (FileStatus status : files) { - temPath = status.getPath(); - if (temPath.getName().equals(targetMaster)) { - return 0; - } - } - - // Check active master - files = fs.listStatus(activePath); - if (files.length == 1) { - temPath = files[0].getPath(); - if (temPath.getName().equals(targetMaster)) { - return 1; - } - } - retValue = -2; - } catch (Exception e) { - e.printStackTrace(); - } - return retValue; - } - - public static int formatHA(TajoConf conf) { - int retValue = -1; - try { - FileSystem fs = getFileSystem(conf); - Path activePath = new Path(TajoConf.getSystemHADir(conf), TajoConstants.SYSTEM_HA_ACTIVE_DIR_NAME); - Path backupPath = new Path(TajoConf.getSystemHADir(conf), TajoConstants.SYSTEM_HA_BACKUP_DIR_NAME); - Path temPath = null; - - int aliveMasterCount = 0; - // Check backup masters - FileStatus[] files = fs.listStatus(backupPath); - for (FileStatus status : files) { - temPath = status.getPath(); - if (isMasterAlive(temPath.getName().replaceAll("_", ":"), conf)) { - aliveMasterCount++; - } - } - - // Check active master - files = fs.listStatus(activePath); - if (files.length == 1) { - temPath = files[0].getPath(); - if (isMasterAlive(temPath.getName().replaceAll("_", ":"), conf)) { - aliveMasterCount++; - } - } - - // If there is any alive master, users can't format storage. - if (aliveMasterCount > 0) { - return 0; - } - - // delete ha path. 
- fs.delete(TajoConf.getSystemHADir(conf), true); - retValue = 1; - } catch (Exception e) { - e.printStackTrace(); - } - return retValue; - } - - - public static List getMasters(TajoConf conf) { - List list = new ArrayList(); - - try { - FileSystem fs = getFileSystem(conf); - Path activePath = new Path(TajoConf.getSystemHADir(conf), TajoConstants.SYSTEM_HA_ACTIVE_DIR_NAME); - Path backupPath = new Path(TajoConf.getSystemHADir(conf), TajoConstants.SYSTEM_HA_BACKUP_DIR_NAME); - Path temPath = null; - - // Check backup masters - FileStatus[] files = fs.listStatus(backupPath); - for (FileStatus status : files) { - temPath = status.getPath(); - list.add(temPath.getName().replaceAll("_", ":")); - } - - // Check active master - files = fs.listStatus(activePath); - if (files.length == 1) { - temPath = files[0].getPath(); - list.add(temPath.getName().replaceAll("_", ":")); - } - - } catch (Exception e) { - e.printStackTrace(); - } - return list; - } - - private static FileSystem getFileSystem(TajoConf conf) throws IOException { - Path rootPath = TajoConf.getTajoRootDir(conf); - return rootPath.getFileSystem(conf); - } - -} diff --git a/tajo-common/src/main/java/org/apache/tajo/service/BaseServiceTracker.java b/tajo-common/src/main/java/org/apache/tajo/service/BaseServiceTracker.java index bf7fd2c45f..e598f2a2a3 100644 --- a/tajo-common/src/main/java/org/apache/tajo/service/BaseServiceTracker.java +++ b/tajo-common/src/main/java/org/apache/tajo/service/BaseServiceTracker.java @@ -76,6 +76,29 @@ public InetSocketAddress getMasterHttpInfo() throws ServiceTrackerException { return tajoMasterInfo.getWebServerAddress(); } + @Override + public int getState(String masterName, TajoConf conf) throws ServiceTrackerException { + String masterAddress = getMasterAddress(); + + if (masterAddress.equals(masterName)) { + return 1; + } else { + return 0; + } + } + + @Override + public int formatHA(TajoConf conf) throws ServiceTrackerException { + throw new ServiceTrackerException("Cannot 
format HA directories on non-HA mode"); + } + + @Override + public List getMasters(TajoConf conf) throws ServiceTrackerException { + List list = TUtil.newList(); + list.add(getMasterAddress()); + return list; + } + @Override public void register() throws IOException { } @@ -85,7 +108,7 @@ public void delete() throws IOException { } @Override - public boolean isActiveStatus() { + public boolean isActiveMaster() { return true; } @@ -94,4 +117,10 @@ public List getMasters() throws IOException { return tajoMasterInfos; } + private String getMasterAddress() { + String masterAddress = tajoMasterInfo.getTajoMasterAddress().getAddress().getHostAddress() + ":" + tajoMasterInfo + .getTajoMasterAddress().getPort(); + + return masterAddress; + } } diff --git a/tajo-common/src/main/java/org/apache/tajo/service/HAServiceTracker.java b/tajo-common/src/main/java/org/apache/tajo/service/HAServiceTracker.java index c80853710e..081b1530a3 100644 --- a/tajo-common/src/main/java/org/apache/tajo/service/HAServiceTracker.java +++ b/tajo-common/src/main/java/org/apache/tajo/service/HAServiceTracker.java @@ -18,13 +18,18 @@ package org.apache.tajo.service; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.net.NetUtils; +import org.apache.tajo.conf.TajoConf; +import org.apache.tajo.util.FileUtil; import javax.net.SocketFactory; import java.net.InetSocketAddress; import java.net.Socket; public abstract class HAServiceTracker implements ServiceTracker { + private static final Log LOG = LogFactory.getLog(HAServiceTracker.class); static SocketFactory socketFactory = SocketFactory.getDefault(); @@ -32,16 +37,29 @@ public boolean isHighAvailable() { return true; } + public static boolean checkConnection(String address) { + return checkConnection(address, ":"); + } + + public static boolean checkConnection(String address, String delimiter) { + String[] hostAddress = address.split(delimiter); + InetSocketAddress socketAddress = new 
InetSocketAddress(hostAddress[0], Integer.parseInt(hostAddress[1])); + return checkConnection(socketAddress); + } + public static boolean checkConnection(InetSocketAddress address) { boolean isAlive = true; + Socket socket = null; try { int connectionTimeout = 10; - Socket socket = socketFactory.createSocket(); + socket = socketFactory.createSocket(); NetUtils.connect(socket, address, connectionTimeout); } catch (Exception e) { isAlive = false; + } finally { + FileUtil.cleanup(LOG, socket); } return isAlive; } diff --git a/tajo-common/src/main/java/org/apache/tajo/service/ServiceTracker.java b/tajo-common/src/main/java/org/apache/tajo/service/ServiceTracker.java index 73ff112231..5888ff39c8 100644 --- a/tajo-common/src/main/java/org/apache/tajo/service/ServiceTracker.java +++ b/tajo-common/src/main/java/org/apache/tajo/service/ServiceTracker.java @@ -18,46 +18,54 @@ package org.apache.tajo.service; +import org.apache.tajo.conf.TajoConf; + import java.io.IOException; import java.net.InetSocketAddress; import java.util.List; public interface ServiceTracker { - public abstract boolean isHighAvailable(); + boolean isHighAvailable(); + + InetSocketAddress getUmbilicalAddress() throws ServiceTrackerException; + + InetSocketAddress getClientServiceAddress() throws ServiceTrackerException; + + InetSocketAddress getResourceTrackerAddress() throws ServiceTrackerException; - public abstract InetSocketAddress getUmbilicalAddress() throws ServiceTrackerException; + InetSocketAddress getCatalogAddress() throws ServiceTrackerException; - public abstract InetSocketAddress getClientServiceAddress() throws ServiceTrackerException; + InetSocketAddress getMasterHttpInfo() throws ServiceTrackerException; - public abstract InetSocketAddress getResourceTrackerAddress() throws ServiceTrackerException; + int getState(String masterName, TajoConf conf) throws ServiceTrackerException; - public abstract InetSocketAddress getCatalogAddress() throws ServiceTrackerException; + int 
formatHA(TajoConf conf) throws ServiceTrackerException; - public abstract InetSocketAddress getMasterHttpInfo() throws ServiceTrackerException; + List getMasters(TajoConf conf) throws ServiceTrackerException; /** * Add master name to shared storage. */ - public void register() throws IOException; + void register() throws IOException; /** * Delete master name to shared storage. * */ - public void delete() throws IOException; + void delete() throws IOException; /** * * @return True if current master is an active master. */ - public boolean isActiveStatus(); + boolean isActiveMaster(); /** * * @return return all master list * @throws IOException */ - public List getMasters() throws IOException; + List getMasters() throws IOException; } diff --git a/tajo-common/src/main/java/org/apache/tajo/util/FileUtil.java b/tajo-common/src/main/java/org/apache/tajo/util/FileUtil.java index 9aa6af944c..0f17926156 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/FileUtil.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/FileUtil.java @@ -19,6 +19,7 @@ package org.apache.tajo.util; import com.google.protobuf.Message; +import org.apache.commons.logging.Log; import org.apache.hadoop.fs.*; import org.apache.hadoop.io.IOUtils; @@ -130,4 +131,25 @@ public static String humanReadableByteCount(long bytes, boolean si) { public static boolean isLocalPath(Path path) { return path.toUri().getScheme().equals("file"); } + + /** + * Close the Closeable objects and ignore any {@link IOException} or + * null pointers. Must only be used for cleanup in exception handlers. + * + * @param log the log to record problems to at debug level. Can be null. + * @param closeables the objects to close + */ + public static void cleanup(Log log, java.io.Closeable... 
closeables) { + for (java.io.Closeable c : closeables) { + if (c != null) { + try { + c.close(); + } catch(IOException e) { + if (log != null && log.isDebugEnabled()) { + log.debug("Exception in closing " + c, e); + } + } + } + } + } } diff --git a/tajo-core/src/main/java/org/apache/tajo/ha/HdfsServiceTracker.java b/tajo-core/src/main/java/org/apache/tajo/ha/HdfsServiceTracker.java index 4a782ec20d..5f1aff8b9d 100644 --- a/tajo-core/src/main/java/org/apache/tajo/ha/HdfsServiceTracker.java +++ b/tajo-core/src/main/java/org/apache/tajo/ha/HdfsServiceTracker.java @@ -24,6 +24,7 @@ import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.net.NetUtils; +import org.apache.hadoop.util.ShutdownHookManager; import org.apache.tajo.TajoConstants; import org.apache.tajo.conf.TajoConf; import org.apache.tajo.conf.TajoConf.ConfVars; @@ -31,7 +32,8 @@ import org.apache.tajo.service.HAServiceTracker; import org.apache.tajo.service.ServiceTrackerException; import org.apache.tajo.service.TajoMasterInfo; -import org.apache.tajo.util.TUtil; +import org.apache.tajo.util.*; +import org.apache.tajo.util.FileUtil; import javax.net.SocketFactory; import java.io.IOException; @@ -58,7 +60,7 @@ public class HdfsServiceTracker extends HAServiceTracker { private Path activePath; private Path backupPath; - private boolean isActiveStatus = false; + private boolean isActiveMaster = false; //thread which runs periodically to see the last time since a heartbeat is received. 
private Thread checkerThread; @@ -74,8 +76,7 @@ public HdfsServiceTracker(TajoConf conf) throws IOException { InetSocketAddress socketAddress = conf.getSocketAddrVar(ConfVars.TAJO_MASTER_UMBILICAL_RPC_ADDRESS); this.masterName = socketAddress.getAddress().getHostAddress() + ":" + socketAddress.getPort(); - - monitorInterval = conf.getIntVar(ConfVars.TAJO_MASTER_HA_MONITOR_INTERVAL); + this.monitorInterval = conf.getIntVar(ConfVars.TAJO_MASTER_HA_MONITOR_INTERVAL); } private void initSystemDirectory() throws IOException { @@ -113,87 +114,144 @@ private void startPingChecker() { } } + /** + * It will creates the following form string. It includes + * + *
+   * {CLIENT_RPC_HOST:PORT}_{RESOURCE_TRACKER_HOST:PORT}_{CATALOG_HOST:PORT}_{MASTER_WEB_HOST:PORT}
+   * 
+ * + * @throws IOException + */ @Override public void register() throws IOException { - FileStatus[] files = fs.listStatus(activePath); + // Check lock file + boolean lockResult = createLockFile(); + + String fileName = masterName.replaceAll(":", "_"); + Path activeFile = new Path(activePath, fileName); + Path backupFile = new Path(backupPath, fileName); + + // Set TajoMasterInfo object which has several rpc server addresses. + StringBuilder sb = new StringBuilder(); + InetSocketAddress address = getHostAddress(HAConstants.MASTER_UMBILICAL_RPC_ADDRESS); + sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()).append("_"); + + address = getHostAddress(HAConstants.MASTER_CLIENT_RPC_ADDRESS); + sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()).append("_"); + + address = getHostAddress(HAConstants.RESOURCE_TRACKER_RPC_ADDRESS); + sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()).append("_"); + + address = getHostAddress(HAConstants.CATALOG_ADDRESS); + sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()).append("_"); + + address = getHostAddress(HAConstants.MASTER_INFO_ADDRESS); + sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()); // Phase 1: If there is not another active master, this try to become active master. - if (files.length == 0) { - createMasterFile(true); + if (lockResult) { + fs.delete(backupFile, false); + createMasterFile(activeFile, sb); currentActiveMaster = masterName; + writeSystemConf(); LOG.info(String.format("This is added to active master (%s)", masterName)); } else { // Phase 2: If there is active master information, we need to check its status. 
- Path activePath = files[0].getPath(); - currentActiveMaster = activePath.getName().replaceAll("_", ":"); + FileStatus[] files = fs.listStatus(activePath); + Path existingActiveFile = null; + if (files.length > 2) { + throw new ServiceTrackerException("Three or more than active master entries."); + } + for(FileStatus eachFile : files) { + if (!eachFile.getPath().getName().equals(HAConstants.ACTIVE_LOCK_FILE)) { + existingActiveFile = eachFile.getPath(); + } + } + currentActiveMaster = existingActiveFile.getName().replaceAll("_", ":"); // Phase 3: If current active master is dead, this master should be active master. - if (!HAServiceUtil.isMasterAlive(currentActiveMaster, conf)) { - fs.delete(activePath, true); - createMasterFile(true); + if (!checkConnection(currentActiveMaster)) { + fs.delete(existingActiveFile, false); + fs.delete(backupFile, false); + createMasterFile(activeFile, sb); currentActiveMaster = masterName; LOG.info(String.format("This is added to active master (%s)", masterName)); } else { // Phase 4: If current active master is alive, this master need to be backup master. - createMasterFile(false); - LOG.info(String.format("This is added to backup masters (%s)", masterName)); + if (masterName.equals(currentActiveMaster)) { + LOG.info(String.format("This has already been added to active master (%s)", masterName)); + } else { + if (fs.exists(backupFile)) { + LOG.info(String.format("This has already been added to backup masters (%s)", masterName)); + } else { + createMasterFile(backupFile, sb); + LOG.info(String.format("This is added to backup master (%s)", masterName)); + } + } } } + startPingChecker(); } /** - * It will creates the following form string. It includes + * Storing the system configs * - *
-   * {CLIENT_RPC_HOST:PORT}_{RESOURCE_TRACKER_HOST:PORT}_{CATALOG_HOST:PORT}_{MASTER_WEB_HOST:PORT}
-   * 
- * - * @param isActive A boolean flag to indicate if it is for master or not. * @throws IOException */ - private void createMasterFile(boolean isActive) throws IOException { - String fileName = masterName.replaceAll(":", "_"); - Path path = null; + private void writeSystemConf() throws IOException { + Path systemConfPath = TajoConf.getSystemConfPath(conf); - if (isActive) { - path = new Path(activePath, fileName); - } else { - path = new Path(backupPath, fileName); + FSDataOutputStream out = FileSystem.create(fs, systemConfPath, + new FsPermission(TajoMaster.SYSTEM_CONF_FILE_PERMISSION)); + try { + conf.writeXml(out); + } finally { + out.close(); } + fs.setReplication(systemConfPath, (short) conf.getIntVar(ConfVars.SYSTEM_CONF_REPLICA_COUNT)); + } - StringBuilder sb = new StringBuilder(); - InetSocketAddress address = getHostAddress(HAConstants.MASTER_CLIENT_RPC_ADDRESS); - sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()).append("_"); - - address = getHostAddress(HAConstants.RESOURCE_TRACKER_RPC_ADDRESS); - sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()).append("_"); - - address = getHostAddress(HAConstants.CATALOG_ADDRESS); - sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()).append("_"); + private boolean createLockFile() throws IOException { + boolean result = false; + FSDataOutputStream lockOutput = null; - address = getHostAddress(HAConstants.MASTER_INFO_ADDRESS); - sb.append(address.getAddress().getHostAddress()).append(":").append(address.getPort()); + Path lockFile = new Path(activePath, HAConstants.ACTIVE_LOCK_FILE); + try { + lockOutput = fs.create(lockFile, false); + lockOutput.hsync(); + lockOutput.close(); + fs.deleteOnExit(lockFile); + result = true; + } catch (FileAlreadyExistsException e) { + LOG.info(String.format("Lock file already exists at (%s)", lockFile.toString())); + result = false; + } catch (Exception e) { + throw new 
IOException("Lock file creation is failed - " + e.getMessage()); + } finally { + FileUtil.cleanup(LOG, lockOutput); + } - FSDataOutputStream out = fs.create(path); + return result; + } + private void createMasterFile(Path path, StringBuilder sb) throws IOException { + FSDataOutputStream out = null; try { + out = fs.create(path, false); + out.writeUTF(sb.toString()); out.hsync(); out.close(); - } catch (FileAlreadyExistsException e) { - createMasterFile(false); - } - if (isActive) { - isActiveStatus = true; - } else { - isActiveStatus = false; + fs.deleteOnExit(path); + } catch (Exception e) { + throw new IOException("File creation is failed - " + e.getMessage()); + } finally { + FileUtil.cleanup(LOG, out); } - - startPingChecker(); } - private InetSocketAddress getHostAddress(int type) { InetSocketAddress address = null; @@ -226,65 +284,61 @@ private InetSocketAddress getHostAddress(int type) { @Override public void delete() throws IOException { + if (ShutdownHookManager.get().isShutdownInProgress()) return; + String fileName = masterName.replaceAll(":", "_"); - Path activeFile = new Path(activePath, fileName); - if (fs.exists(activeFile)) { - fs.delete(activeFile, true); - } + fs.delete(new Path(activePath, fileName), false); + fs.delete(new Path(activePath, HAConstants.ACTIVE_LOCK_FILE), false); + fs.delete(new Path(backupPath, fileName), false); - Path backupFile = new Path(backupPath, fileName); - if (fs.exists(backupFile)) { - fs.delete(backupFile, true); - } - if (isActiveStatus) { - isActiveStatus = false; - } stopped = true; } @Override - public boolean isActiveStatus() { - return isActiveStatus; + public boolean isActiveMaster() { + if (currentActiveMaster.equals(masterName)) { + return true; + } else { + return false; + } } @Override public List getMasters() throws IOException { List list = TUtil.newList(); - Path path = null; FileStatus[] files = fs.listStatus(activePath); - if (files.length == 1) { - path = files[0].getPath(); - 
list.add(createTajoMasterInfo(path, true)); + for(FileStatus status : files) { + if (!status.getPath().getName().equals(HAConstants.ACTIVE_LOCK_FILE)) { + list.add(getTajoMasterInfo(status.getPath(), true)); + } } files = fs.listStatus(backupPath); for (FileStatus status : files) { - path = status.getPath(); - list.add(createTajoMasterInfo(path, false)); + list.add(getTajoMasterInfo(status.getPath(), false)); } return list; } - private TajoMasterInfo createTajoMasterInfo(Path path, boolean isActive) throws IOException { + private TajoMasterInfo getTajoMasterInfo(Path path, boolean isActive) throws IOException { String masterAddress = path.getName().replaceAll("_", ":"); - boolean isAlive = HAServiceUtil.isMasterAlive(masterAddress, conf); + boolean isAlive = checkConnection(masterAddress); FSDataInputStream stream = fs.open(path); String data = stream.readUTF(); - stream.close(); String[] addresses = data.split("_"); TajoMasterInfo info = new TajoMasterInfo(); - info.setTajoMasterAddress(NetUtils.createSocketAddr(masterAddress)); - info.setTajoClientAddress(NetUtils.createSocketAddr(addresses[0])); - info.setWorkerResourceTrackerAddr(NetUtils.createSocketAddr(addresses[1])); - info.setCatalogAddress(NetUtils.createSocketAddr(addresses[2])); - info.setWebServerAddress(NetUtils.createSocketAddr(addresses[3])); + info.setTajoMasterAddress(NetUtils.createSocketAddr(addresses[0])); + info.setTajoClientAddress(NetUtils.createSocketAddr(addresses[1])); + info.setWorkerResourceTrackerAddr(NetUtils.createSocketAddr(addresses[2])); + info.setCatalogAddress(NetUtils.createSocketAddr(addresses[3])); + info.setWebServerAddress(NetUtils.createSocketAddr(addresses[4])); info.setAvailable(isAlive); info.setActive(isActive); @@ -299,21 +353,18 @@ public void run() { synchronized (HdfsServiceTracker.this) { try { if (!currentActiveMaster.equals(masterName)) { - boolean isAlive = HAServiceUtil.isMasterAlive(currentActiveMaster, conf); if (LOG.isDebugEnabled()) { - 
LOG.debug("currentActiveMaster:" + currentActiveMaster + ", thisMasterName:" + masterName - + ", isAlive:" + isAlive); + LOG.debug("currentActiveMaster:" + currentActiveMaster + ", thisMasterName:" + masterName); } // If active master is dead, this master should be active master instead of // previous active master. - if (!isAlive) { - FileStatus[] files = fs.listStatus(activePath); - if (files.length == 0 || (files.length == 1 - && currentActiveMaster.equals(files[0].getPath().getName().replaceAll("_", ":")))) { - delete(); - register(); - } + if (!checkConnection(currentActiveMaster)) { + Path activeFile = new Path(activePath, currentActiveMaster.replaceAll(":", "_")); + fs.delete(activeFile, false); + Path lockFile = new Path(activePath, HAConstants.ACTIVE_LOCK_FILE); + fs.delete(lockFile, false); + register(); } } } catch (Exception e) { @@ -345,7 +396,7 @@ public void run() { @Override public InetSocketAddress getUmbilicalAddress() { if (!checkConnection(umbilicalRpcAddr)) { - umbilicalRpcAddr = NetUtils.createSocketAddr(getAddressElements(conf).get(MASTER_UMBILICAL_RPC_ADDRESS)); + umbilicalRpcAddr = NetUtils.createSocketAddr(getAddressElements().get(MASTER_UMBILICAL_RPC_ADDRESS)); } return umbilicalRpcAddr; @@ -354,7 +405,7 @@ public InetSocketAddress getUmbilicalAddress() { @Override public InetSocketAddress getClientServiceAddress() { if (!checkConnection(clientRpcAddr)) { - clientRpcAddr = NetUtils.createSocketAddr(getAddressElements(conf).get(MASTER_CLIENT_RPC_ADDRESS)); + clientRpcAddr = NetUtils.createSocketAddr(getAddressElements().get(MASTER_CLIENT_RPC_ADDRESS)); } return clientRpcAddr; @@ -363,7 +414,7 @@ public InetSocketAddress getClientServiceAddress() { @Override public InetSocketAddress getResourceTrackerAddress() { if (!checkConnection(resourceTrackerRpcAddr)) { - resourceTrackerRpcAddr = NetUtils.createSocketAddr(getAddressElements(conf).get(RESOURCE_TRACKER_RPC_ADDRESS)); + resourceTrackerRpcAddr = 
NetUtils.createSocketAddr(getAddressElements().get(RESOURCE_TRACKER_RPC_ADDRESS)); } return resourceTrackerRpcAddr; @@ -372,7 +423,7 @@ public InetSocketAddress getResourceTrackerAddress() { @Override public InetSocketAddress getCatalogAddress() { if (!checkConnection(catalogAddr)) { - catalogAddr = NetUtils.createSocketAddr(getAddressElements(conf).get(CATALOG_ADDRESS)); + catalogAddr = NetUtils.createSocketAddr(getAddressElements().get(CATALOG_ADDRESS)); } return catalogAddr; @@ -381,7 +432,7 @@ public InetSocketAddress getCatalogAddress() { @Override public InetSocketAddress getMasterHttpInfo() throws ServiceTrackerException { if (!checkConnection(masterHttpInfoAddr)) { - masterHttpInfoAddr = NetUtils.createSocketAddr(getAddressElements(conf).get(MASTER_HTTP_INFO)); + masterHttpInfoAddr = NetUtils.createSocketAddr(getAddressElements().get(MASTER_HTTP_INFO)); } return masterHttpInfoAddr; @@ -390,11 +441,10 @@ public InetSocketAddress getMasterHttpInfo() throws ServiceTrackerException { /** * Reads a text file stored in HDFS file, and then return all service addresses read from a HDFS file. 
* * - * @param conf * @return all service addresses * @throws ServiceTrackerException */ - private static List getAddressElements(TajoConf conf) throws ServiceTrackerException { + private synchronized List getAddressElements() throws ServiceTrackerException { try { FileSystem fs = getFileSystem(conf); @@ -408,15 +458,34 @@ private static List getAddressElements(TajoConf conf) throws ServiceTrac } FileStatus[] files = fs.listStatus(activeMasterBaseDir); + /* wait for active master from HDFS */ + int pause = conf.getIntVar(ConfVars.TAJO_MASTER_HA_CLIENT_RETRY_PAUSE_TIME); + int maxRetry = conf.getIntVar(ConfVars.TAJO_MASTER_HA_CLIENT_RETRY_MAX_NUM); + int retry = 0; + + while (files.length < 2 && retry < maxRetry) { + try { + this.wait(pause); + } catch (InterruptedException e) { + throw new ServiceTrackerException(e); + } + files = fs.listStatus(activeMasterBaseDir); + } if (files.length < 1) { + LOG.error("Exceeded the maximum retry (" + maxRetry + ") to read TajoMaster address from HDFS"); throw new ServiceTrackerException("No active master entry"); - } else if (files.length > 1) { - throw new ServiceTrackerException("Two or more than active master entries."); + } else if (files.length > 2) { + throw new ServiceTrackerException("Three or more than active master entries."); } - // We can ensure that there is only one file due to the above assertion. 
- Path activeMasterEntry = files[0].getPath(); + Path activeMasterEntry = null; + + for (FileStatus eachFile : files) { + if (!eachFile.getPath().getName().equals(HAConstants.ACTIVE_LOCK_FILE)) { + activeMasterEntry = eachFile.getPath(); + } + } if (!fs.isFile(activeMasterEntry)) { throw new ServiceTrackerException("Active master entry must be a file, but it is a directory."); @@ -424,12 +493,9 @@ private static List getAddressElements(TajoConf conf) throws ServiceTrac List addressElements = TUtil.newList(); - addressElements.add(activeMasterEntry.getName().replaceAll("_", ":")); // Add UMBILICAL_RPC_ADDRESS to elements - FSDataInputStream stream = fs.open(activeMasterEntry); String data = stream.readUTF(); stream.close(); - addressElements.addAll(TUtil.newList(data.split("_"))); // Add remains entries to elements // ensure the number of entries @@ -442,33 +508,8 @@ private static List getAddressElements(TajoConf conf) throws ServiceTrac } } - - public static boolean isMasterAlive(InetSocketAddress masterAddress, TajoConf conf) { - return isMasterAlive(org.apache.tajo.util.NetUtils.normalizeInetSocketAddress(masterAddress), conf); - } - - public static boolean isMasterAlive(String masterName, TajoConf conf) { - boolean isAlive = true; - - try { - // how to create sockets - SocketFactory socketFactory = org.apache.hadoop.net.NetUtils.getDefaultSocketFactory(conf); - - int connectionTimeout = conf.getInt(CommonConfigurationKeys.IPC_CLIENT_CONNECT_TIMEOUT_KEY, - CommonConfigurationKeys.IPC_CLIENT_CONNECT_TIMEOUT_DEFAULT); - - InetSocketAddress server = org.apache.hadoop.net.NetUtils.createSocketAddr(masterName); - - // connected socket - Socket socket = socketFactory.createSocket(); - org.apache.hadoop.net.NetUtils.connect(socket, server, connectionTimeout); - } catch (Exception e) { - isAlive = false; - } - return isAlive; - } - - public static int getState(String masterName, TajoConf conf) { + @Override + public int getState(String masterName, TajoConf conf) throws 
ServiceTrackerException { String targetMaster = masterName.replaceAll(":", "_"); int retValue = -1; @@ -498,12 +539,13 @@ public static int getState(String masterName, TajoConf conf) { } retValue = -2; } catch (Exception e) { - e.printStackTrace(); + throw new ServiceTrackerException("Cannot get HA state - ERROR:" + e.getMessage()); } return retValue; } - public static int formatHA(TajoConf conf) { + @Override + public int formatHA(TajoConf conf) throws ServiceTrackerException{ int retValue = -1; try { FileSystem fs = getFileSystem(conf); @@ -512,20 +554,20 @@ public static int formatHA(TajoConf conf) { Path temPath = null; int aliveMasterCount = 0; + // Check backup masters FileStatus[] files = fs.listStatus(backupPath); - for (FileStatus status : files) { - temPath = status.getPath(); - if (isMasterAlive(temPath.getName().replaceAll("_", ":"), conf)) { + for (FileStatus eachFile : files) { + if (checkConnection(eachFile.getPath().getName(), "_")) { aliveMasterCount++; } } // Check active master files = fs.listStatus(activePath); - if (files.length == 1) { - temPath = files[0].getPath(); - if (isMasterAlive(temPath.getName().replaceAll("_", ":"), conf)) { + for (FileStatus eachFile : files) { + if (!eachFile.getPath().getName().equals(HAConstants.ACTIVE_LOCK_FILE) && + checkConnection(eachFile.getPath().getName(), "_")) { aliveMasterCount++; } } @@ -539,13 +581,13 @@ public static int formatHA(TajoConf conf) { fs.delete(TajoConf.getSystemHADir(conf), true); retValue = 1; } catch (Exception e) { - e.printStackTrace(); + throw new ServiceTrackerException("Cannot format HA directories - ERROR:" + e.getMessage()); } return retValue; } - - public static List getMasters(TajoConf conf) { + @Override + public List getMasters(TajoConf conf) throws ServiceTrackerException { List list = new ArrayList(); try { @@ -569,7 +611,7 @@ public static List getMasters(TajoConf conf) { } } catch (Exception e) { - e.printStackTrace(); + throw new ServiceTrackerException("Cannot get 
master lists - ERROR:" + e.getMessage()); } return list; } @@ -578,4 +620,4 @@ private static FileSystem getFileSystem(TajoConf conf) throws IOException { Path rootPath = TajoConf.getTajoRootDir(conf); return rootPath.getFileSystem(conf); } -} +} \ No newline at end of file diff --git a/tajo-core/src/main/java/org/apache/tajo/master/TajoMaster.java b/tajo-core/src/main/java/org/apache/tajo/master/TajoMaster.java index 371dfb4779..e6d22fe0d6 100644 --- a/tajo-core/src/main/java/org/apache/tajo/master/TajoMaster.java +++ b/tajo-core/src/main/java/org/apache/tajo/master/TajoMaster.java @@ -228,11 +228,6 @@ private void initWebServer() throws Exception { } } - public boolean isActiveMaster() { - return (haService != null ? haService.isActiveStatus() : true); - } - - private void checkAndInitializeSystemDirectories() throws IOException { // Get Tajo root dir this.tajoRootPath = TajoConf.getTajoRootDir(systemConf); @@ -342,14 +337,18 @@ private void writeSystemConf() throws IOException { defaultFS.delete(systemConfPath, false); } - FSDataOutputStream out = FileSystem.create(defaultFS, systemConfPath, + // In TajoMaster HA, some master might see LeaseExpiredException because of lease mismatch. Thus, + // we need to create below xml file at HdfsServiceTracker::writeSystemConf. 
+ if (!systemConf.getBoolVar(TajoConf.ConfVars.TAJO_MASTER_HA_ENABLE)) { + FSDataOutputStream out = FileSystem.create(defaultFS, systemConfPath, new FsPermission(SYSTEM_CONF_FILE_PERMISSION)); - try { - systemConf.writeXml(out); - } finally { - out.close(); + try { + systemConf.writeXml(out); + } finally { + out.close(); + } + defaultFS.setReplication(systemConfPath, (short) systemConf.getIntVar(ConfVars.SYSTEM_CONF_REPLICA_COUNT)); } - defaultFS.setReplication(systemConfPath, (short) systemConf.getIntVar(ConfVars.SYSTEM_CONF_REPLICA_COUNT)); } private void checkBaseTBSpaceAndDatabase() throws IOException { diff --git a/tajo-core/src/main/java/org/apache/tajo/util/JSPUtil.java b/tajo-core/src/main/java/org/apache/tajo/util/JSPUtil.java index aee2ced973..578b15aabc 100644 --- a/tajo-core/src/main/java/org/apache/tajo/util/JSPUtil.java +++ b/tajo-core/src/main/java/org/apache/tajo/util/JSPUtil.java @@ -197,7 +197,7 @@ public static String getMasterActiveLabel(MasterContext context) { ServiceTracker haService = context.getHAService(); String activeLabel = ""; if (haService != null) { - if (haService.isActiveStatus()) { + if (haService.isActiveMaster()) { activeLabel = "(active)"; } else { activeLabel = "(backup)"; diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/TajoResourceAllocator.java b/tajo-core/src/main/java/org/apache/tajo/worker/TajoResourceAllocator.java index 49cb1e9af6..1ef01a890f 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/TajoResourceAllocator.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/TajoResourceAllocator.java @@ -28,7 +28,6 @@ import org.apache.hadoop.yarn.event.EventHandler; import org.apache.tajo.ExecutionBlockId; import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.ha.HAServiceUtil; import org.apache.tajo.ipc.ContainerProtocol; import org.apache.tajo.ipc.QueryCoordinatorProtocol; import org.apache.tajo.ipc.QueryCoordinatorProtocol.*; diff --git 
a/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorker.java b/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorker.java index 3c55add775..e806def691 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorker.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/TajoWorker.java @@ -35,6 +35,7 @@ import org.apache.tajo.catalog.CatalogService; import org.apache.tajo.conf.TajoConf; import org.apache.tajo.service.ServiceTracker; +import org.apache.tajo.service.ServiceTrackerException; import org.apache.tajo.service.ServiceTrackerFactory; import org.apache.tajo.service.TajoMasterInfo; import org.apache.tajo.ipc.QueryCoordinatorProtocol.ClusterResourceSummary; @@ -358,6 +359,7 @@ public void serviceStart() throws Exception { startJvmPauseMonitor(); tajoMasterInfo = new TajoMasterInfo(); + if (systemConf.getBoolVar(TajoConf.ConfVars.TAJO_MASTER_HA_ENABLE)) { tajoMasterInfo.setTajoMasterAddress(serviceTracker.getUmbilicalAddress()); tajoMasterInfo.setWorkerResourceTrackerAddr(serviceTracker.getResourceTrackerAddress()); diff --git a/tajo-core/src/main/resources/webapps/admin/catalogview.jsp b/tajo-core/src/main/resources/webapps/admin/catalogview.jsp index 1ff81a6ede..3455d0b35e 100644 --- a/tajo-core/src/main/resources/webapps/admin/catalogview.jsp +++ b/tajo-core/src/main/resources/webapps/admin/catalogview.jsp @@ -30,9 +30,13 @@ <%@ page import="java.util.Collection" %> <%@ page import="java.util.List" %> <%@ page import="java.util.Map" %> -<%@ page import="org.apache.tajo.service.ServiceTracker" %> <% TajoMaster master = (TajoMaster) StaticHttpServer.getInstance().getAttribute("tajo.info.server.object"); + + String[] masterName = master.getMasterName().split(":"); + InetSocketAddress socketAddress = new InetSocketAddress(masterName[0], Integer.parseInt(masterName[1])); + String masterLabel = socketAddress.getAddress().getHostName()+ ":" + socketAddress.getPort(); + CatalogService catalog = master.getCatalog(); String catalogType = 
request.getParameter("type"); @@ -63,7 +67,7 @@ ServiceTracker haService = master.getContext().getHAService(); String activeLabel = ""; if (haService != null) { - if (haService.isActiveStatus()) { + if (haService.isActiveMaster()) { activeLabel = "(active)"; } else { activeLabel = "(backup)"; @@ -81,7 +85,7 @@ <%@ include file="header.jsp"%>
-

Tajo Master: <%=master.getMasterName()%> <%=activeLabel%>

+

Tajo Master: <%=masterLabel%> <%=activeLabel%>


Catalog

diff --git a/tajo-core/src/main/resources/webapps/admin/cluster.jsp b/tajo-core/src/main/resources/webapps/admin/cluster.jsp index aca1153df1..6a618b00a1 100644 --- a/tajo-core/src/main/resources/webapps/admin/cluster.jsp +++ b/tajo-core/src/main/resources/webapps/admin/cluster.jsp @@ -31,9 +31,15 @@ <%@ page import="org.apache.tajo.webapp.StaticHttpServer" %> <%@ page import="java.util.*" %> <%@ page import="org.apache.tajo.service.ServiceTracker" %> +<%@ page import="java.net.InetSocketAddress" %> <% TajoMaster master = (TajoMaster) StaticHttpServer.getInstance().getAttribute("tajo.info.server.object"); + + String[] masterName = master.getMasterName().split(":"); + InetSocketAddress socketAddress = new InetSocketAddress(masterName[0], Integer.parseInt(masterName[1])); + String masterLabel = socketAddress.getAddress().getHostName()+ ":" + socketAddress.getPort(); + Map workers = master.getContext().getResourceManager().getWorkers(); List wokerKeys = new ArrayList(workers.keySet()); Collections.sort(wokerKeys); @@ -81,7 +87,7 @@ String activeLabel = ""; if (haService != null) { - if (haService.isActiveStatus()) { + if (haService.isActiveMaster()) { activeLabel = "(active)"; } else { activeLabel = "(backup)"; @@ -114,7 +120,7 @@ <%@ include file="header.jsp"%>
-

Tajo Master: <%=master.getMasterName()%> <%=activeLabel%>

+

Tajo Master: <%=masterLabel%> <%=activeLabel%>

Live:<%=numLiveMasters%>, Dead: <%=deadMasterHtml%>, Total: <%=masters.size()%>
<% if (masters != null) { diff --git a/tajo-core/src/main/resources/webapps/admin/index.jsp b/tajo-core/src/main/resources/webapps/admin/index.jsp index 0a0558e11b..aa7917df9e 100644 --- a/tajo-core/src/main/resources/webapps/admin/index.jsp +++ b/tajo-core/src/main/resources/webapps/admin/index.jsp @@ -35,10 +35,16 @@ <%@ page import="java.util.Collection" %> <%@ page import="java.util.Date" %> <%@ page import="java.util.Map" %> +<%@ page import="java.net.InetSocketAddress" %> <%@ page import="org.apache.tajo.service.ServiceTracker" %> <% TajoMaster master = (TajoMaster) StaticHttpServer.getInstance().getAttribute("tajo.info.server.object"); + + String[] masterName = master.getMasterName().split(":"); + InetSocketAddress socketAddress = new InetSocketAddress(masterName[0], Integer.parseInt(masterName[1])); + String masterLabel = socketAddress.getAddress().getHostName()+ ":" + socketAddress.getPort(); + Map workers = master.getContext().getResourceManager().getWorkers(); Map inactiveWorkers = master.getContext().getResourceManager().getInactiveWorkers(); @@ -91,7 +97,7 @@ String activeLabel = ""; if (haService != null) { - if (haService.isActiveStatus()) { + if (haService.isActiveMaster()) { activeLabel = "(active)"; } else { activeLabel = "(backup)"; @@ -122,7 +128,7 @@ <%@ include file="header.jsp"%>
-

Tajo Master: <%=master.getMasterName()%> <%=activeLabel%>

+

Tajo Master: <%=masterLabel%> <%=activeLabel%>


Master Status

diff --git a/tajo-core/src/main/resources/webapps/admin/query.jsp b/tajo-core/src/main/resources/webapps/admin/query.jsp index 85f71763e2..894b9d0009 100644 --- a/tajo-core/src/main/resources/webapps/admin/query.jsp +++ b/tajo-core/src/main/resources/webapps/admin/query.jsp @@ -29,10 +29,15 @@ <%@ page import="java.util.*" %> <%@ page import="org.apache.tajo.util.history.HistoryReader" %> <%@ page import="org.apache.tajo.master.QueryInfo" %> +<%@ page import="java.net.InetSocketAddress" %> <% TajoMaster master = (TajoMaster) StaticHttpServer.getInstance().getAttribute("tajo.info.server.object"); + String[] masterName = master.getMasterName().split(":"); + InetSocketAddress socketAddress = new InetSocketAddress(masterName[0], Integer.parseInt(masterName[1])); + String masterLabel = socketAddress.getAddress().getHostName()+ ":" + socketAddress.getPort(); + List runningQueries = new ArrayList(master.getContext().getQueryJobManager().getSubmittedQueries()); @@ -111,7 +116,7 @@ <%@ include file="header.jsp"%>
-

Tajo Master: <%=master.getMasterName()%> <%=JSPUtil.getMasterActiveLabel(master.getContext())%>

+

Tajo Master: <%=masterLabel%> <%=JSPUtil.getMasterActiveLabel(master.getContext())%>


Running Queries

<% diff --git a/tajo-core/src/main/resources/webapps/admin/query_executor.jsp b/tajo-core/src/main/resources/webapps/admin/query_executor.jsp index a0f9a0a38d..1a58583627 100644 --- a/tajo-core/src/main/resources/webapps/admin/query_executor.jsp +++ b/tajo-core/src/main/resources/webapps/admin/query_executor.jsp @@ -22,14 +22,19 @@ <%@ page import="org.apache.tajo.service.ServiceTracker" %> <%@ page import="org.apache.tajo.webapp.StaticHttpServer" %> <%@ page import="javax.xml.ws.Service" %> +<%@ page import="java.net.InetSocketAddress" %> <% TajoMaster master = (TajoMaster) StaticHttpServer.getInstance().getAttribute("tajo.info.server.object"); + String[] masterName = master.getMasterName().split(":"); + InetSocketAddress socketAddress = new InetSocketAddress(masterName[0], Integer.parseInt(masterName[1])); + String masterLabel = socketAddress.getAddress().getHostName()+ ":" + socketAddress.getPort(); + ServiceTracker haService = master.getContext().getHAService(); String activeLabel = ""; if (haService != null) { - if (haService.isActiveStatus()) { + if (haService.isActiveMaster()) { activeLabel = "(active)"; } else { activeLabel = "(backup)"; @@ -288,7 +293,7 @@ function getPage() { <%@ include file="header.jsp"%>
-

Tajo Master: <%=master.getMasterName()%> <%=activeLabel%>

+

Tajo Master: <%=masterLabel%> <%=activeLabel%>


Query

Database : diff --git a/tajo-core/src/test/java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java b/tajo-core/src/test/java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java index 7c91e22c93..c8ddc03a75 100644 --- a/tajo-core/src/test/java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java +++ b/tajo-core/src/test/java/org/apache/tajo/ha/TestHAServiceHDFSImpl.java @@ -30,6 +30,7 @@ import org.apache.tajo.service.ServiceTrackerFactory; import org.junit.Test; +import static junit.framework.Assert.assertTrue; import static junit.framework.TestCase.assertEquals; import static org.junit.Assert.*; @@ -68,12 +69,12 @@ public final void testAutoFailOver() throws Exception { verifySystemDirectories(fs); - Path backupMasterFile = new Path(backupPath, backupMaster.getMasterName() - .replaceAll(":", "_")); - assertTrue(fs.exists(backupMasterFile)); + assertEquals(2, fs.listStatus(activePath).length); + assertEquals(1, fs.listStatus(backupPath).length); - assertTrue(cluster.getMaster().isActiveMaster()); - assertFalse(backupMaster.isActiveMaster()); + assertTrue(fs.exists(new Path(activePath, HAConstants.ACTIVE_LOCK_FILE))); + assertTrue(fs.exists(new Path(activePath, cluster.getMaster().getMasterName().replaceAll(":", "_")))); + assertTrue(fs.exists(new Path(backupPath, backupMaster.getMasterName().replaceAll(":", "_")))); createDatabaseAndTable(); verifyDataBaseAndTable(); @@ -81,13 +82,14 @@ public final void testAutoFailOver() throws Exception { cluster.getMaster().stop(); - Thread.sleep(7000); - - assertFalse(cluster.getMaster().isActiveMaster()); - assertTrue(backupMaster.isActiveMaster()); - client = cluster.newTajoClient(); verifyDataBaseAndTable(); + + assertEquals(2, fs.listStatus(activePath).length); + assertEquals(0, fs.listStatus(backupPath).length); + + assertTrue(fs.exists(new Path(activePath, HAConstants.ACTIVE_LOCK_FILE))); + assertTrue(fs.exists(new Path(activePath, backupMaster.getMasterName().replaceAll(":", "_")))); } finally { client.close(); backupMaster.stop(); @@ 
-110,6 +112,7 @@ private void setConfiguration() { masterAddress + ":" + NetUtils.getFreeSocketPort()); conf.setBoolVar(TajoConf.ConfVars.TAJO_MASTER_HA_ENABLE, true); + conf.setIntVar(TajoConf.ConfVars.TAJO_MASTER_HA_MONITOR_INTERVAL, 1000); //Client API service RPC Server conf.setIntVar(TajoConf.ConfVars.MASTER_SERVICE_RPC_SERVER_WORKER_THREAD_NUM, 2); @@ -132,9 +135,6 @@ private void verifySystemDirectories(FileSystem fs) throws Exception { backupPath = new Path(haPath, TajoConstants.SYSTEM_HA_BACKUP_DIR_NAME); assertTrue(fs.exists(backupPath)); - - assertEquals(1, fs.listStatus(activePath).length); - assertEquals(1, fs.listStatus(backupPath).length); } private void createDatabaseAndTable() throws Exception { From 63c2fb6c5ab9ca5712f7af0b6d3e0e015d7e4b7d Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 15 May 2015 10:47:02 +0900 Subject: [PATCH 036/141] Add missing files in the tajo-hcatalog driver --- .../tajo-hcatalog/pom.xml | 739 +++++++++++++++ .../tajo/catalog/store/HCatalogStore.java | 891 ++++++++++++++++++ .../store/HCatalogStoreClientPool.java | 170 ++++ .../tajo/catalog/store/HCatalogUtil.java | 147 +++ .../tajo/catalog/store/TestHCatalogStore.java | 402 ++++++++ 5 files changed, 2349 insertions(+) create mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml create mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStore.java create mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStoreClientPool.java create mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogUtil.java create mode 100644 tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/test/java/org/apache/tajo/catalog/store/TestHCatalogStore.java diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml new file mode 100644 index 
0000000000..fe8f34a436 --- /dev/null +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml @@ -0,0 +1,739 @@ + + + + + + tajo-project + org.apache.tajo + 0.11.0-SNAPSHOT + ../../../tajo-project + + 4.0.0 + tajo-hcatalog + jar + Tajo Catalog Drivers HCatalog + + UTF-8 + UTF-8 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.6 + 1.6 + ${project.build.sourceEncoding} + + + + org.apache.rat + apache-rat-plugin + + + verify + + check + + + + + + derby.log + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + copy-dependencies + prepare-package + + copy-dependencies + + + runtime + ${project.build.directory}/lib + false + false + true + + + + + + org.apache.maven.plugins + maven-surefire-report-plugin + + + + + + + org.apache.tajo + tajo-common + + + org.apache.tajo + tajo-catalog-common + + + org.apache.tajo + tajo-catalog-client + + + org.apache.tajo + tajo-catalog-server + + + org.apache.tajo + tajo-rpc + + + org.apache.tajo + tajo-storage-common + + + junit + junit + test + + + org.apache.thrift + libfb303 + 0.9.0 + provided + + + org.apache.thrift + libthrift + 0.9.0 + provided + + + org.apache.hadoop + hadoop-mapreduce-client-core + ${hadoop.version} + provided + + + org.apache.hadoop + hadoop-common + ${hadoop.version} + provided + + + + + + hcatalog-0.12.0 + + false + + + 0.12.0 + 1.5.0 + 2.1.0 + + + + org.apache.hive + hive-exec + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-contrib + + + org.apache.hive + hive-hbase-handler + + + org.apache.hive + hive-metastore + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-shims + + + org.apache.hive + hive-testutils + + + org.apache.thrift + libfb303 + + + org.apache.thrift + libthrift + + + com.jolbox + bonecp + + + com.google.protobuf + protobuf-java + + + + + org.apache.hive + hive-metastore + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-serde + + + org.apache.hive + 
hive-shimss + + + org.apache.thrift + libfb303 + + + org.apache.thrift + libthrift + + + com.jolbox + bonecp + + + + + org.apache.hive + hive-cli + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-metastore + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-service + + + org.apache.hive + hive-shims + + + com.jolbox + bonecp + + + jline + jline + + + + + org.apache.hive.hcatalog + hcatalog-core + ${hive.version} + + + org.apache.hive + hive-cli + + + org.apache.hive + hive-common + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-metastore + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-service + + + org.apache.hive + hive-shims + + + com.jolbox + bonecp + + + + + com.twitter + parquet-hive-bundle + ${parquet.version} + + + + + hcatalog-0.13.0 + + false + + + 0.13.0 + 1.5.0 + 2.1.0 + + + + org.apache.hive + hive-exec + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-contrib + + + org.apache.hive + hive-hbase-handler + + + org.apache.hive + hive-metastore + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-shims + + + org.apache.hive + hive-testutils + + + org.apache.thrift + libfb303 + + + org.apache.thrift + libthrift + + + com.jolbox + bonecp + + + com.google.protobuf + protobuf-java + + + + + org.apache.hive + hive-metastore + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-shimss + + + org.apache.thrift + libfb303 + + + org.apache.thrift + libthrift + + + com.jolbox + bonecp + + + + + org.apache.hive + hive-cli + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-metastore + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-service + + + org.apache.hive + hive-shims + + + com.jolbox + bonecp + + + jline + jline + + + + + 
org.apache.hive.hcatalog + hive-hcatalog-core + ${hive.version} + + + org.apache.hive + hive-cli + + + org.apache.hive + hive-common + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-metastore + + + com.google.guava + guava + + + org.codehaus.jackson + jackson-mapper-asl + + + + + com.twitter + parquet-hive-bundle + ${parquet.version} + + + + + hcatalog-0.13.1 + + false + + + 0.13.1 + 1.5.0 + 2.1.0 + + + + org.apache.hive + hive-exec + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-contrib + + + org.apache.hive + hive-hbase-handler + + + org.apache.hive + hive-metastore + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-shims + + + org.apache.hive + hive-testutils + + + org.apache.thrift + libfb303 + + + org.apache.thrift + libthrift + + + com.jolbox + bonecp + + + com.google.protobuf + protobuf-java + + + + + org.apache.hive + hive-metastore + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-shimss + + + org.apache.thrift + libfb303 + + + org.apache.thrift + libthrift + + + com.jolbox + bonecp + + + + + org.apache.hive + hive-cli + ${hive.version} + provided + + + org.apache.hive + hive-common + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-metastore + + + org.apache.hive + hive-serde + + + org.apache.hive + hive-service + + + org.apache.hive + hive-shims + + + com.jolbox + bonecp + + + jline + jline + + + + + org.apache.hive.hcatalog + hive-hcatalog-core + ${hive.version} + + + org.apache.hive + hive-cli + + + org.apache.hive + hive-common + + + org.apache.hive + hive-exec + + + org.apache.hive + hive-metastore + + + com.google.guava + guava + + + org.codehaus.jackson + jackson-mapper-asl + + + + + com.twitter + parquet-hive-bundle + ${parquet.version} + + + + + docs + + false + + + + + org.apache.maven.plugins + maven-javadoc-plugin + + + + module-javadocs + package + + jar + + + 
${project.build.directory} + + + + + + + + + src + + false + + + + + org.apache.maven.plugins + maven-source-plugin + + + + hadoop-java-sources + package + + jar-no-fork + + + + + + + + + + + + + org.apache.maven.plugins + maven-surefire-report-plugin + + + + + diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStore.java new file mode 100644 index 0000000000..2c3fc6ac1a --- /dev/null +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStore.java @@ -0,0 +1,891 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.catalog.store; + +import com.google.common.collect.Lists; + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.metastore.TableType; +import org.apache.hadoop.hive.metastore.api.*; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe; +import org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe; +import org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe; +import org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe; +import org.apache.hcatalog.common.HCatUtil; +import org.apache.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hcatalog.data.schema.HCatSchema; +import org.apache.tajo.TajoConstants; +import org.apache.tajo.catalog.*; +import org.apache.tajo.catalog.exception.*; +import org.apache.tajo.catalog.partition.PartitionMethodDesc; +import org.apache.tajo.catalog.proto.CatalogProtos; +import org.apache.tajo.catalog.proto.CatalogProtos.ColumnProto; +import org.apache.tajo.catalog.proto.CatalogProtos.DatabaseProto; +import org.apache.tajo.catalog.proto.CatalogProtos.IndexProto; +import org.apache.tajo.catalog.proto.CatalogProtos.TableDescriptorProto; +import org.apache.tajo.catalog.proto.CatalogProtos.TableOptionProto; +import org.apache.tajo.catalog.proto.CatalogProtos.TablePartitionProto; +import org.apache.tajo.catalog.proto.CatalogProtos.TableStatsProto; +import org.apache.tajo.catalog.proto.CatalogProtos.TablespaceProto; +import org.apache.tajo.catalog.statistics.TableStats; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.common.exception.NotImplementedException; +import org.apache.tajo.conf.TajoConf; +import org.apache.tajo.exception.InternalException; +import 
org.apache.tajo.storage.StorageConstants; +import org.apache.tajo.util.KeyValueSet; +import org.apache.thrift.TException; + +import java.io.IOException; +import java.util.*; + +import static org.apache.tajo.catalog.proto.CatalogProtos.PartitionType; + +public class HCatalogStore extends CatalogConstants implements CatalogStore { + protected final Log LOG = LogFactory.getLog(getClass()); + + private static String HIVE_WAREHOUSE_DIR_CONF_KEY = "hive.metastore.warehouse.dir"; + + protected Configuration conf; + private static final int CLIENT_POOL_SIZE = 2; + private final HCatalogStoreClientPool clientPool; + private final String defaultTableSpaceUri; + + public HCatalogStore(final Configuration conf) throws InternalException { + if (!(conf instanceof TajoConf)) { + throw new CatalogException("Invalid Configuration Type:" + conf.getClass().getSimpleName()); + } + this.conf = conf; + this.defaultTableSpaceUri = TajoConf.getWarehouseDir((TajoConf) conf).toString(); + this.clientPool = new HCatalogStoreClientPool(CLIENT_POOL_SIZE, conf); + } + + @Override + public boolean existTable(final String databaseName, final String tableName) throws CatalogException { + boolean exist = false; + org.apache.hadoop.hive.ql.metadata.Table table; + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + // get table + try { + client = clientPool.getClient(); + table = HCatalogUtil.getTable(client.getHiveClient(), databaseName, tableName); + if (table != null) { + exist = true; + } + } catch (NoSuchObjectException nsoe) { + exist = false; + } catch (Exception e) { + throw new CatalogException(e); + } finally { + if (client != null) { + client.release(); + } + } + + return exist; + } + + @Override + public final CatalogProtos.TableDescProto getTable(String databaseName, final String tableName) throws CatalogException { + org.apache.hadoop.hive.ql.metadata.Table table = null; + HCatalogStoreClientPool.HCatalogStoreClient client = null; + Path path = null; + CatalogProtos.StoreType 
storeType = null; + org.apache.tajo.catalog.Schema schema = null; + KeyValueSet options = null; + TableStats stats = null; + PartitionMethodDesc partitions = null; + + ////////////////////////////////// + // set tajo table schema. + ////////////////////////////////// + try { + // get hive table schema + try { + client = clientPool.getClient(); + table = HCatalogUtil.getTable(client.getHiveClient(), databaseName, tableName); + path = table.getPath(); + } catch (NoSuchObjectException nsoe) { + throw new CatalogException("Table not found. - tableName:" + tableName, nsoe); + } catch (Exception e) { + throw new CatalogException(e); + } + + // convert hcatalog field schema into tajo field schema. + schema = new org.apache.tajo.catalog.Schema(); + HCatSchema tableSchema = null; + + try { + tableSchema = HCatUtil.getTableSchemaWithPtnCols(table); + } catch (IOException ioe) { + throw new CatalogException("Fail to get table schema. - tableName:" + tableName, ioe); + } + List fieldSchemaList = tableSchema.getFields(); + boolean isPartitionKey = false; + for (HCatFieldSchema eachField : fieldSchemaList) { + isPartitionKey = false; + + if (table.getPartitionKeys() != null) { + for (FieldSchema partitionKey : table.getPartitionKeys()) { + if (partitionKey.getName().equals(eachField.getName())) { + isPartitionKey = true; + } + } + } + + if (!isPartitionKey) { + String fieldName = databaseName + CatalogConstants.IDENTIFIER_DELIMITER + tableName + + CatalogConstants.IDENTIFIER_DELIMITER + eachField.getName(); + TajoDataTypes.Type dataType = HCatalogUtil.getTajoFieldType(eachField.getType().toString()); + schema.addColumn(fieldName, dataType); + } + } + + // validate field schema. + try { + HCatalogUtil.validateHCatTableAndTajoSchema(tableSchema); + } catch (Exception e) { + throw new CatalogException("HCatalog cannot support schema. 
- schema:" + tableSchema.toString(), e); + } + + stats = new TableStats(); + options = new KeyValueSet(); + options.putAll(table.getParameters()); + options.remove("EXTERNAL"); + + Properties properties = table.getMetadata(); + if (properties != null) { + // set field delimiter + String fieldDelimiter = "", nullFormat = ""; + if (properties.getProperty(serdeConstants.FIELD_DELIM) != null) { + fieldDelimiter = properties.getProperty(serdeConstants.FIELD_DELIM); + } else { + // if hive table used default row format delimiter, Properties doesn't have it. + // So, Tajo must set as follows: + fieldDelimiter = "\u0001"; + } + + // set null format + if (properties.getProperty(serdeConstants.SERIALIZATION_NULL_FORMAT) != null) { + nullFormat = properties.getProperty(serdeConstants.SERIALIZATION_NULL_FORMAT); + } else { + nullFormat = "\\N"; + } + options.remove(serdeConstants.SERIALIZATION_NULL_FORMAT); + + // set file output format + String fileOutputformat = properties.getProperty(hive_metastoreConstants.FILE_OUTPUT_FORMAT); + storeType = CatalogUtil.getStoreType(HCatalogUtil.getStoreType(fileOutputformat)); + + if (storeType.equals(CatalogProtos.StoreType.TEXTFILE)) { + options.set(StorageConstants.TEXT_DELIMITER, StringEscapeUtils.escapeJava(fieldDelimiter)); + options.set(StorageConstants.TEXT_NULL, StringEscapeUtils.escapeJava(nullFormat)); + } else if (storeType.equals(CatalogProtos.StoreType.RCFILE)) { + options.set(StorageConstants.RCFILE_NULL, StringEscapeUtils.escapeJava(nullFormat)); + String serde = properties.getProperty(serdeConstants.SERIALIZATION_LIB); + if (LazyBinaryColumnarSerDe.class.getName().equals(serde)) { + options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE); + } else if (ColumnarSerDe.class.getName().equals(serde)) { + options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE); + } + } else if (storeType.equals(CatalogProtos.StoreType.SEQUENCEFILE) ) { + 
options.set(StorageConstants.SEQUENCEFILE_DELIMITER, StringEscapeUtils.escapeJava(fieldDelimiter)); + options.set(StorageConstants.SEQUENCEFILE_NULL, StringEscapeUtils.escapeJava(nullFormat)); + String serde = properties.getProperty(serdeConstants.SERIALIZATION_LIB); + if (LazyBinarySerDe.class.getName().equals(serde)) { + options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE); + } else if (LazySimpleSerDe.class.getName().equals(serde)) { + options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE); + } + } + + // set data size + long totalSize = 0; + if (properties.getProperty("totalSize") != null) { + totalSize = Long.parseLong(properties.getProperty("totalSize")); + } else { + try { + FileSystem fs = path.getFileSystem(conf); + if (fs.exists(path)) { + totalSize = fs.getContentSummary(path).getLength(); + } + } catch (IOException ioe) { + throw new CatalogException("Fail to get path. - path:" + path.toString(), ioe); + } + } + stats.setNumBytes(totalSize); + } + + // set partition keys + List partitionKeys = table.getPartitionKeys(); + + if (null != partitionKeys) { + org.apache.tajo.catalog.Schema expressionSchema = new org.apache.tajo.catalog.Schema(); + StringBuilder sb = new StringBuilder(); + if (partitionKeys.size() > 0) { + for (int i = 0; i < partitionKeys.size(); i++) { + FieldSchema fieldSchema = partitionKeys.get(i); + TajoDataTypes.Type dataType = HCatalogUtil.getTajoFieldType(fieldSchema.getType().toString()); + String fieldName = databaseName + CatalogConstants.IDENTIFIER_DELIMITER + tableName + + CatalogConstants.IDENTIFIER_DELIMITER + fieldSchema.getName(); + expressionSchema.addColumn(new Column(fieldName, dataType)); + if (i > 0) { + sb.append(","); + } + sb.append(fieldSchema.getName()); + } + partitions = new PartitionMethodDesc( + databaseName, + tableName, + PartitionType.COLUMN, + sb.toString(), + expressionSchema); + } + } + } finally { + if(client != null) client.release(); + } 
+ TableMeta meta = new TableMeta(storeType, options); + TableDesc tableDesc = new TableDesc(databaseName + "." + tableName, schema, meta, path.toUri()); + if (table.getTableType().equals(TableType.EXTERNAL_TABLE)) { + tableDesc.setExternal(true); + } + if (stats != null) { + tableDesc.setStats(stats); + } + if (partitions != null) { + tableDesc.setPartitionMethod(partitions); + } + return tableDesc.getProto(); + } + + + private TajoDataTypes.Type getDataType(final String typeStr) { + try { + return Enum.valueOf(TajoDataTypes.Type.class, typeStr); + } catch (IllegalArgumentException iae) { + LOG.error("Cannot find a matched type against from '" + typeStr + "'"); + return null; + } + } + + @Override + public final List getAllTableNames(String databaseName) throws CatalogException { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + try { + client = clientPool.getClient(); + return client.getHiveClient().getAllTables(databaseName); + } catch (TException e) { + throw new CatalogException(e); + } finally { + if(client != null) client.release(); + } + } + + @Override + public void createTablespace(String spaceName, String spaceUri) throws CatalogException { + // SKIP + } + + @Override + public boolean existTablespace(String spaceName) throws CatalogException { + // SKIP + return spaceName.equals(TajoConstants.DEFAULT_TABLESPACE_NAME); + } + + @Override + public void dropTablespace(String spaceName) throws CatalogException { + // SKIP + } + + @Override + public Collection getAllTablespaceNames() throws CatalogException { + return Lists.newArrayList(TajoConstants.DEFAULT_TABLESPACE_NAME); + } + + @Override + public TablespaceProto getTablespace(String spaceName) throws CatalogException { + if (spaceName.equals(TajoConstants.DEFAULT_TABLESPACE_NAME)) { + TablespaceProto.Builder builder = TablespaceProto.newBuilder(); + builder.setSpaceName(TajoConstants.DEFAULT_TABLESPACE_NAME); + builder.setUri(defaultTableSpaceUri); + return builder.build(); + } else { + 
throw new CatalogException("tablespace concept is not supported in HCatalogStore"); + } + } + + @Override + public void updateTableStats(CatalogProtos.UpdateTableStatsProto statsProto) throws + CatalogException { + // TODO - not implemented yet + } + + @Override + public void alterTablespace(CatalogProtos.AlterTablespaceProto alterProto) throws CatalogException { + throw new CatalogException("tablespace concept is not supported in HCatalogStore"); + } + + @Override + public void createDatabase(String databaseName, String tablespaceName) throws CatalogException { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + try { + Database database = new Database( + databaseName, + "", + defaultTableSpaceUri + "/" + databaseName, + new HashMap()); + client = clientPool.getClient(); + client.getHiveClient().createDatabase(database); + } catch (AlreadyExistsException e) { + throw new AlreadyExistsDatabaseException(databaseName); + } catch (Throwable t) { + throw new CatalogException(t); + } finally { + if (client != null) { + client.release(); + } + } + } + + @Override + public boolean existDatabase(String databaseName) throws CatalogException { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + try { + client = clientPool.getClient(); + List databaseNames = client.getHiveClient().getAllDatabases(); + return databaseNames.contains(databaseName); + } catch (Throwable t) { + throw new CatalogException(t); + } finally { + if (client != null) { + client.release(); + } + } + } + + @Override + public void dropDatabase(String databaseName) throws CatalogException { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + try { + client = clientPool.getClient(); + client.getHiveClient().dropDatabase(databaseName); + } catch (NoSuchObjectException e) { + throw new NoSuchDatabaseException(databaseName); + } catch (Throwable t) { + throw new CatalogException(databaseName); + } finally { + if (client != null) { + client.release(); + } + } + } + + 
@Override + public Collection getAllDatabaseNames() throws CatalogException { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + try { + client = clientPool.getClient(); + return client.getHiveClient().getAllDatabases(); + } catch (TException e) { + throw new CatalogException(e); + } finally { + if (client != null) { + client.release(); + } + } + } + + @Override + public final void createTable(final CatalogProtos.TableDescProto tableDescProto) throws CatalogException { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + TableDesc tableDesc = new TableDesc(tableDescProto); + String[] splitted = CatalogUtil.splitFQTableName(tableDesc.getName()); + String databaseName = splitted[0]; + String tableName = splitted[1]; + + try { + client = clientPool.getClient(); + + org.apache.hadoop.hive.metastore.api.Table table = new org.apache.hadoop.hive.metastore.api.Table(); + table.setDbName(databaseName); + table.setTableName(tableName); + table.setParameters(new HashMap(tableDesc.getMeta().getOptions().getAllKeyValus())); + // TODO: set owner + //table.setOwner(); + + StorageDescriptor sd = new StorageDescriptor(); + sd.setSerdeInfo(new SerDeInfo()); + sd.getSerdeInfo().setParameters(new HashMap()); + sd.getSerdeInfo().setName(table.getTableName()); + + // if tajo set location method, thrift client make exception as follows: + // Caused by: MetaException(message:java.lang.NullPointerException) + // If you want to modify table path, you have to modify on Hive cli. 
+ if (tableDesc.isExternal()) { + table.setTableType(TableType.EXTERNAL_TABLE.name()); + table.putToParameters("EXTERNAL", "TRUE"); + + Path tablePath = new Path(tableDesc.getPath()); + FileSystem fs = tablePath.getFileSystem(conf); + if (fs.isFile(tablePath)) { + LOG.warn("A table path is a file, but HCatalog does not allow a file path."); + sd.setLocation(tablePath.getParent().toString()); + } else { + sd.setLocation(tablePath.toString()); + } + } + + // set column information + List columns = tableDesc.getSchema().getColumns(); + ArrayList cols = new ArrayList(columns.size()); + + for (Column eachField : columns) { + cols.add(new FieldSchema(eachField.getSimpleName(), + HCatalogUtil.getHiveFieldType(eachField.getDataType()), "")); + } + sd.setCols(cols); + + // set partition keys + if (tableDesc.hasPartition() && tableDesc.getPartitionMethod().getPartitionType().equals(PartitionType.COLUMN)) { + List partitionKeys = new ArrayList(); + for (Column eachPartitionKey : tableDesc.getPartitionMethod().getExpressionSchema().getColumns()) { + partitionKeys.add(new FieldSchema(eachPartitionKey.getSimpleName(), + HCatalogUtil.getHiveFieldType(eachPartitionKey.getDataType()), "")); + } + table.setPartitionKeys(partitionKeys); + } + + if (tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.RCFILE)) { + String serde = tableDesc.getMeta().getOption(StorageConstants.RCFILE_SERDE); + sd.setInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat.class.getName()); + sd.setOutputFormat(org.apache.hadoop.hive.ql.io.RCFileOutputFormat.class.getName()); + if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) { + sd.getSerdeInfo().setSerializationLib(org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe.class.getName()); + } else { + sd.getSerdeInfo().setSerializationLib( + org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe.class.getName()); + } + + if (tableDesc.getMeta().getOptions().containsKey(StorageConstants.RCFILE_NULL)) { + 
table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT, + StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.RCFILE_NULL))); + } + } else if (tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.CSV) + || tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.TEXTFILE)) { + sd.getSerdeInfo().setSerializationLib(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName()); + sd.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class.getName()); + sd.setOutputFormat(org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat.class.getName()); + + String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.TEXT_DELIMITER, + StorageConstants.DEFAULT_FIELD_DELIMITER); + + // User can use an unicode for filed delimiter such as \u0001, \001. + // In this case, java console will convert this value into "\\u001". + // And hive will un-espace this value again. + // As a result, user can use right field delimiter. + // So, we have to un-escape this value. 
+ sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT, + StringEscapeUtils.unescapeJava(fieldDelimiter)); + sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, + StringEscapeUtils.unescapeJava(fieldDelimiter)); + table.getParameters().remove(StorageConstants.TEXT_DELIMITER); + + if (tableDesc.getMeta().containsOption(StorageConstants.TEXT_NULL)) { + table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT, + StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.TEXT_NULL))); + table.getParameters().remove(StorageConstants.TEXT_NULL); + } + } else if (tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.SEQUENCEFILE)) { + String serde = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE); + sd.setInputFormat(org.apache.hadoop.mapred.SequenceFileInputFormat.class.getName()); + sd.setOutputFormat(org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat.class.getName()); + + if (StorageConstants.DEFAULT_TEXT_SERDE.equals(serde)) { + sd.getSerdeInfo().setSerializationLib(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe.class.getName()); + + String fieldDelimiter = tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_DELIMITER, + StorageConstants.DEFAULT_FIELD_DELIMITER); + + // User can use an unicode for filed delimiter such as \u0001, \001. + // In this case, java console will convert this value into "\\u001". + // And hive will un-espace this value again. + // As a result, user can use right field delimiter. + // So, we have to un-escape this value. 
+ sd.getSerdeInfo().putToParameters(serdeConstants.SERIALIZATION_FORMAT, + StringEscapeUtils.unescapeJava(fieldDelimiter)); + sd.getSerdeInfo().putToParameters(serdeConstants.FIELD_DELIM, + StringEscapeUtils.unescapeJava(fieldDelimiter)); + table.getParameters().remove(StorageConstants.SEQUENCEFILE_DELIMITER); + } else { + sd.getSerdeInfo().setSerializationLib(org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe.class.getName()); + } + + if (tableDesc.getMeta().containsOption(StorageConstants.SEQUENCEFILE_NULL)) { + table.putToParameters(serdeConstants.SERIALIZATION_NULL_FORMAT, + StringEscapeUtils.unescapeJava(tableDesc.getMeta().getOption(StorageConstants.SEQUENCEFILE_NULL))); + table.getParameters().remove(StorageConstants.SEQUENCEFILE_NULL); + } + } else { + if (tableDesc.getMeta().getStoreType().equals(CatalogProtos.StoreType.PARQUET)) { + sd.setInputFormat(parquet.hive.DeprecatedParquetInputFormat.class.getName()); + sd.setOutputFormat(parquet.hive.DeprecatedParquetOutputFormat.class.getName()); + sd.getSerdeInfo().setSerializationLib(parquet.hive.serde.ParquetHiveSerDe.class.getName()); + } else { + throw new CatalogException(new NotImplementedException(tableDesc.getMeta().getStoreType + ().name())); + } + } + + sd.setSortCols(new ArrayList()); + + table.setSd(sd); + client.getHiveClient().createTable(table); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new CatalogException(e); + } finally { + if(client != null) client.release(); + } + } + + @Override + public final void dropTable(String databaseName, final String tableName) throws CatalogException { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + try { + client = clientPool.getClient(); + client.getHiveClient().dropTable(databaseName, tableName, false, false); + } catch (NoSuchObjectException nsoe) { + } catch (Exception e) { + throw new CatalogException(e); + } finally { + if (client != null) { + client.release(); + } + } + } + + + @Override + public 
void alterTable(final CatalogProtos.AlterTableDescProto alterTableDescProto) throws CatalogException { + final String[] split = CatalogUtil.splitFQTableName(alterTableDescProto.getTableName()); + + if (split.length == 1) { + throw new IllegalArgumentException("alterTable() requires a qualified table name, but it is \"" + + alterTableDescProto.getTableName() + "\"."); + } + + final String databaseName = split[0]; + final String tableName = split[1]; + + + switch (alterTableDescProto.getAlterTableType()) { + case RENAME_TABLE: + if (existTable(databaseName,alterTableDescProto.getNewTableName().toLowerCase())) { + throw new AlreadyExistsTableException(alterTableDescProto.getNewTableName()); + } + renameTable(databaseName, tableName, alterTableDescProto.getNewTableName().toLowerCase()); + break; + case RENAME_COLUMN: + if (existColumn(databaseName,tableName, alterTableDescProto.getAlterColumnName().getNewColumnName())) { + throw new ColumnNameAlreadyExistException(alterTableDescProto.getAlterColumnName().getNewColumnName()); + } + renameColumn(databaseName, tableName, alterTableDescProto.getAlterColumnName()); + break; + case ADD_COLUMN: + if (existColumn(databaseName,tableName, alterTableDescProto.getAddColumn().getName())) { + throw new ColumnNameAlreadyExistException(alterTableDescProto.getAddColumn().getName()); + } + addNewColumn(databaseName, tableName, alterTableDescProto.getAddColumn()); + break; + default: + //TODO + } + } + + + private void renameTable(String databaseName, String tableName, String newTableName) { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + try { + client = clientPool.getClient(); + Table newTable = client.getHiveClient().getTable(databaseName, tableName); + newTable.setTableName(newTableName); + client.getHiveClient().alter_table(databaseName, tableName, newTable); + + } catch (NoSuchObjectException nsoe) { + } catch (Exception e) { + throw new CatalogException(e); + } finally { + if (client != null) { + client.release(); + 
} + } + } + + private void renameColumn(String databaseName, String tableName, CatalogProtos.AlterColumnProto alterColumnProto) { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + try { + + client = clientPool.getClient(); + Table table = client.getHiveClient().getTable(databaseName, tableName); + List columns = table.getSd().getCols(); + + for (final FieldSchema currentColumn : columns) { + if (currentColumn.getName().equalsIgnoreCase(alterColumnProto.getOldColumnName())) { + currentColumn.setName(alterColumnProto.getNewColumnName()); + } + } + client.getHiveClient().alter_table(databaseName, tableName, table); + + } catch (NoSuchObjectException nsoe) { + } catch (Exception e) { + throw new CatalogException(e); + } finally { + if (client != null) { + client.release(); + } + } + } + + + private void addNewColumn(String databaseName, String tableName, CatalogProtos.ColumnProto columnProto) { + HCatalogStoreClientPool.HCatalogStoreClient client = null; + try { + + client = clientPool.getClient(); + Table table = client.getHiveClient().getTable(databaseName, tableName); + List columns = table.getSd().getCols(); + columns.add(new FieldSchema(columnProto.getName(), + HCatalogUtil.getHiveFieldType(columnProto.getDataType()), "")); + client.getHiveClient().alter_table(databaseName, tableName, table); + + + } catch (NoSuchObjectException nsoe) { + } catch (Exception e) { + throw new CatalogException(e); + } finally { + if (client != null) { + client.release(); + } + } + } + + @Override + public void addPartitionMethod(CatalogProtos.PartitionMethodProto partitionMethodProto) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public CatalogProtos.PartitionMethodProto getPartitionMethod(String databaseName, String tableName) + throws CatalogException { + return null; // TODO - not implemented yet + } + + @Override + public boolean existPartitionMethod(String databaseName, String tableName) throws CatalogException { + return false; // 
TODO - not implemented yet + } + + @Override + public void dropPartitionMethod(String databaseName, String tableName) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public void addPartitions(CatalogProtos.PartitionsProto partitionsProto) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public void addPartition(String databaseName, String tableName, CatalogProtos.PartitionDescProto partitionDescProto) throws CatalogException { + + } + + @Override + public CatalogProtos.PartitionsProto getPartitions(String tableName) throws CatalogException { + return null; // TODO - not implemented yet + } + + @Override + public CatalogProtos.PartitionDescProto getPartition(String partitionName) throws CatalogException { + return null; // TODO - not implemented yet + } + + @Override + public void delPartition(String partitionName) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public void dropPartitions(String tableName) throws CatalogException { + + } + + + @Override + public final void addFunction(final FunctionDesc func) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public final void deleteFunction(final FunctionDesc func) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public final void existFunction(final FunctionDesc func) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public final List getAllFunctionNames() throws CatalogException { + // TODO - not implemented yet + return null; + } + + @Override + public void dropIndex(String databaseName, String indexName) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public boolean existIndexByName(String databaseName, String indexName) throws CatalogException { + // TODO - not implemented yet + return false; + } + + @Override + public CatalogProtos.IndexDescProto[] getIndexes(String databaseName, String tableName) throws 
CatalogException { + // TODO - not implemented yet + return null; + } + + @Override + public void createIndex(CatalogProtos.IndexDescProto proto) throws CatalogException { + // TODO - not implemented yet + } + + @Override + public CatalogProtos.IndexDescProto getIndexByName(String databaseName, String indexName) throws CatalogException { + // TODO - not implemented yet + return null; + } + + @Override + public CatalogProtos.IndexDescProto getIndexByColumn(String databaseName, String tableName, String columnName) + throws CatalogException { + // TODO - not implemented yet + return null; + } + + @Override + public boolean existIndexByColumn(String databaseName, String tableName, String columnName) throws CatalogException { + // TODO - not implemented yet + return false; + } + + @Override + public final void close() { + clientPool.close(); + } + + private boolean existColumn(final String databaseName ,final String tableName , final String columnName) throws CatalogException { + boolean exist = false; + HCatalogStoreClientPool.HCatalogStoreClient client = null; + + try { + + client = clientPool.getClient(); + Table table = client.getHiveClient().getTable(databaseName, tableName); + List columns = table.getSd().getCols(); + + for (final FieldSchema currentColumn : columns) { + if (currentColumn.getName().equalsIgnoreCase(columnName)) { + exist = true; + } + } + client.getHiveClient().alter_table(databaseName, tableName, table); + + } catch (NoSuchObjectException nsoe) { + } catch (Exception e) { + throw new CatalogException(e); + } finally { + if (client != null) { + client.release(); + } + } + + return exist; + } + + @Override + public List getAllColumns() throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List getAllDatabases() throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List getAllIndexes() throws CatalogException { + throw new UnsupportedOperationException(); + } + + 
@Override + public List getAllPartitions() throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List getAllTableOptions() throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List getAllTableStats() throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List getAllTables() throws CatalogException { + throw new UnsupportedOperationException(); + } + + @Override + public List getTablespaces() throws CatalogException { + throw new UnsupportedOperationException(); + } +} diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStoreClientPool.java b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStoreClientPool.java new file mode 100644 index 0000000000..8ccb100b01 --- /dev/null +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogStoreClientPool.java @@ -0,0 +1,170 @@ +// Copyright 2012 Cloudera Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+package org.apache.tajo.catalog.store; + + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.*; +import org.apache.hadoop.hive.metastore.api.MetaException; +import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.log4j.Logger; + +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * Manages a pool of HiveMetaStoreClient connections. If the connection pool is empty + * a new client is created and added to the pool. There is no size limit. + */ +public class HCatalogStoreClientPool { + private static final Logger LOG = Logger.getLogger(HCatalogStoreClientPool.class); + private final ConcurrentLinkedQueue clientPool = + new ConcurrentLinkedQueue(); + private AtomicBoolean poolClosed = new AtomicBoolean(false); + private HiveConf hiveConf; + + /** + * A wrapper around the HiveMetaStoreClient that manages interactions with the + * connection pool. + */ + public class HCatalogStoreClient { + private final IMetaStoreClient hiveClient; + public AtomicBoolean isInUse = new AtomicBoolean(false); + + private HCatalogStoreClient(HiveConf hiveConf) { + try { + HiveMetaHookLoader hookLoader = new HiveMetaHookLoader() { + @Override + public HiveMetaHook getHook(Table table) throws MetaException { + /* metadata hook implementation, or null if this + * storage handler does not need any metadata notifications + */ + return null; + } + }; + + this.hiveClient = RetryingMetaStoreClient.getProxy(hiveConf, hookLoader, HiveMetaStoreClient.class.getName()); + clientPool.add(this); + LOG.info("MetaStoreClient created (size = " + clientPool.size() + ")"); + } catch (Exception e) { + // Turn in to an unchecked exception + throw new IllegalStateException(e); + } + } + + /** + * Returns the internal HiveMetaStoreClient object. 
+ */ + public IMetaStoreClient getHiveClient() { + return hiveClient; + } + + /** + * Returns this client back to the connection pool. If the connection pool has been + * closed, just close the Hive client connection. + */ + public synchronized void release() { + if(!this.isInUse.getAndSet(false)){ + return; + } + // Ensure the connection isn't returned to the pool if the pool has been closed. + // This lock is needed to ensure proper behavior when a thread reads poolClosed + // is false, but a call to pool.close() comes in immediately afterward. + if (poolClosed.get()) { + this.getHiveClient().close(); + } else { + clientPool.add(this); + } + } + + // Marks this client as in use + private void markInUse() { + isInUse.set(true); + } + } + + public HCatalogStoreClientPool(int initialSize) { + this(initialSize, new HiveConf(HCatalogStoreClientPool.class)); + } + + public HCatalogStoreClientPool(int initialSize, HiveConf hiveConf) { + this.hiveConf = hiveConf; + addClients(initialSize); + } + + public HCatalogStoreClientPool(int initialSize, Configuration conf) { + this.hiveConf = new HiveConf(); + setParameters(conf); + addClients(initialSize); + } + + public void setParameters(Configuration conf) { + for( Iterator> iter = conf.iterator(); iter.hasNext();) { + Map.Entry entry = iter.next(); + this.hiveConf.set(entry.getKey(), entry.getValue()); + } + } + + /** + * Add numClients to the client pool. + */ + public void addClients(int numClients) { + for (int i = 0; i < numClients; ++i) { + clientPool.add(new HCatalogStoreClient(hiveConf)); + } + } + + /** + * Gets a client from the pool. If the pool is empty a new client is created. + */ + public synchronized HCatalogStoreClient getClient() { + // The MetaStoreClient c'tor relies on knowing the Hadoop version by asking + // org.apache.hadoop.util.VersionInfo. 
The VersionInfo class relies on opening + // the 'common-version-info.properties' file as a resource from hadoop-common*.jar + // using the Thread's context classloader. If necessary, set the Thread's context + // classloader, otherwise VersionInfo will fail in it's c'tor. + if (Thread.currentThread().getContextClassLoader() == null) { + Thread.currentThread().setContextClassLoader(ClassLoader.getSystemClassLoader()); + } + + HCatalogStoreClient client = clientPool.poll(); + // The pool was empty so create a new client and return that. + if (client == null) { + client = new HCatalogStoreClient(hiveConf); + } + client.markInUse(); + + return client; + } + + /** + * Removes all items from the connection pool and closes all Hive Meta Store client + * connections. Can be called multiple times. + */ + public void close() { + // Ensure no more items get added to the pool once close is called. + if (poolClosed.getAndSet(true)) { + return; + } + + HCatalogStoreClient client = null; + while ((client = clientPool.poll()) != null) { + client.getHiveClient().close(); + } + } +} diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogUtil.java b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogUtil.java new file mode 100644 index 0000000000..8e8e58cc51 --- /dev/null +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/main/java/org/apache/tajo/catalog/store/HCatalogUtil.java @@ -0,0 +1,147 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.catalog.store; + +import com.google.common.base.Preconditions; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.metastore.IMetaStoreClient; +import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat; +import org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat; +import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hcatalog.common.HCatException; +import org.apache.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hcatalog.data.schema.HCatSchema; +import org.apache.tajo.catalog.exception.CatalogException; +import org.apache.tajo.catalog.proto.CatalogProtos; +import org.apache.tajo.catalog.CatalogUtil; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.thrift.TException; +import parquet.hadoop.mapred.DeprecatedParquetOutputFormat; + +public class HCatalogUtil { + protected final Log LOG = LogFactory.getLog(getClass()); + + public static void validateHCatTableAndTajoSchema(HCatSchema tblSchema) throws CatalogException { + for (HCatFieldSchema hcatField : tblSchema.getFields()) { + validateHCatFieldAndTajoSchema(hcatField); + } + } + + private static void validateHCatFieldAndTajoSchema(HCatFieldSchema fieldSchema) throws CatalogException { + try { + HCatFieldSchema.Type fieldType = fieldSchema.getType(); + switch (fieldType) { + case ARRAY: + throw new HCatException("Tajo cannot support array field 
type."); + case STRUCT: + throw new HCatException("Tajo cannot support struct field type."); + case MAP: + throw new HCatException("Tajo cannot support map field type."); + } + } catch (HCatException e) { + throw new CatalogException("incompatible hcatalog types when assigning to tajo type. - " + + "HCatFieldSchema:" + fieldSchema); + } + } + + public static TajoDataTypes.Type getTajoFieldType(String fieldType) { + Preconditions.checkNotNull(fieldType); + + String typeStr = null; + + if(fieldType.equalsIgnoreCase(serdeConstants.INT_TYPE_NAME)) + typeStr = "INT4"; + else if(fieldType.equalsIgnoreCase(serdeConstants.TINYINT_TYPE_NAME)) + typeStr = "INT1"; + else if(fieldType.equalsIgnoreCase(serdeConstants.SMALLINT_TYPE_NAME)) + typeStr = "INT2"; + else if(fieldType.equalsIgnoreCase(serdeConstants.BIGINT_TYPE_NAME)) + typeStr = "INT8"; + else if(fieldType.equalsIgnoreCase(serdeConstants.BOOLEAN_TYPE_NAME)) + typeStr = "BOOLEAN"; + else if(fieldType.equalsIgnoreCase(serdeConstants.FLOAT_TYPE_NAME)) + typeStr = "FLOAT4"; + else if(fieldType.equalsIgnoreCase(serdeConstants.DOUBLE_TYPE_NAME)) + typeStr = "FLOAT8"; + else if(fieldType.equalsIgnoreCase(serdeConstants.STRING_TYPE_NAME)) + typeStr = "TEXT"; + else if(fieldType.equalsIgnoreCase(serdeConstants.BINARY_TYPE_NAME)) + typeStr = "BLOB"; + + try { + return Enum.valueOf(TajoDataTypes.Type.class, typeStr); + } catch (IllegalArgumentException iae) { + throw new CatalogException("Cannot find a matched type against from '" + typeStr + "'"); + } + } + + public static String getHiveFieldType(TajoDataTypes.DataType dataType) { + Preconditions.checkNotNull(dataType); + + switch (dataType.getType()) { + case CHAR: return serdeConstants.CHAR_TYPE_NAME; + case BOOLEAN: return serdeConstants.BOOLEAN_TYPE_NAME; + case INT1: return serdeConstants.TINYINT_TYPE_NAME; + case INT2: return serdeConstants.SMALLINT_TYPE_NAME; + case INT4: return serdeConstants.INT_TYPE_NAME; + case INT8: return serdeConstants.BIGINT_TYPE_NAME; + case 
FLOAT4: return serdeConstants.FLOAT_TYPE_NAME; + case FLOAT8: return serdeConstants.DOUBLE_TYPE_NAME; + case TEXT: return serdeConstants.STRING_TYPE_NAME; + case VARCHAR: return serdeConstants.VARCHAR_TYPE_NAME; + case NCHAR: return serdeConstants.VARCHAR_TYPE_NAME; + case NVARCHAR: return serdeConstants.VARCHAR_TYPE_NAME; + case BINARY: return serdeConstants.BINARY_TYPE_NAME; + case VARBINARY: return serdeConstants.BINARY_TYPE_NAME; + case BLOB: return serdeConstants.BINARY_TYPE_NAME; + case DATE: return serdeConstants.DATE_TYPE_NAME; + case TIMESTAMP: return serdeConstants.TIMESTAMP_TYPE_NAME; + default: + throw new CatalogException(dataType + " is not supported."); + } + } + + public static String getStoreType(String fileFormat) { + Preconditions.checkNotNull(fileFormat); + + String[] fileFormatArrary = fileFormat.split("\\."); + if(fileFormatArrary.length < 1) { + throw new CatalogException("Hive file output format is wrong. - file output format:" + fileFormat); + } + + String outputFormatClass = fileFormatArrary[fileFormatArrary.length-1]; + if(outputFormatClass.equals(HiveIgnoreKeyTextOutputFormat.class.getSimpleName())) { + return CatalogUtil.TEXTFILE_NAME; + } else if(outputFormatClass.equals(HiveSequenceFileOutputFormat.class.getSimpleName())) { + return CatalogProtos.StoreType.SEQUENCEFILE.name(); + } else if(outputFormatClass.equals(RCFileOutputFormat.class.getSimpleName())) { + return CatalogProtos.StoreType.RCFILE.name(); + } else if(outputFormatClass.equals(DeprecatedParquetOutputFormat.class.getSimpleName())) { + return CatalogProtos.StoreType.PARQUET.name(); + } else { + throw new CatalogException("Not supported file output format. 
- file output format:" + fileFormat); + } + } + + public static Table getTable(IMetaStoreClient client, String dbName, String tableName) throws TException { + return new Table(client.getTable(dbName, tableName)); + } +} diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/test/java/org/apache/tajo/catalog/store/TestHCatalogStore.java b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/test/java/org/apache/tajo/catalog/store/TestHCatalogStore.java new file mode 100644 index 0000000000..725f665394 --- /dev/null +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/src/test/java/org/apache/tajo/catalog/store/TestHCatalogStore.java @@ -0,0 +1,402 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.catalog.store; + + +import org.apache.commons.lang.StringEscapeUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.tajo.catalog.CatalogUtil; +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.TableDesc; +import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.catalog.partition.PartitionMethodDesc; +import org.apache.tajo.catalog.proto.CatalogProtos; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.conf.TajoConf; +import org.apache.tajo.storage.StorageConstants; +import org.apache.tajo.util.CommonTestingUtil; +import org.apache.tajo.util.KeyValueSet; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.IOException; +import java.util.List; + +import static org.junit.Assert.*; + +/** + * TestHCatalogStore. Test case for + * {@link org.apache.tajo.catalog.store.HCatalogStore} + */ + +public class TestHCatalogStore { + private static final String DB_NAME = "test_hive"; + private static final String CUSTOMER = "customer"; + private static final String NATION = "nation"; + private static final String REGION = "region"; + private static final String SUPPLIER = "supplier"; + + private static HCatalogStore store; + private static Path warehousePath; + + @BeforeClass + public static void setUp() throws Exception { + Path testPath = CommonTestingUtil.getTestDir(); + warehousePath = new Path(testPath, "warehouse"); + + //create local hiveMeta + HiveConf conf = new HiveConf(); + String jdbcUri = "jdbc:derby:;databaseName="+testPath.toUri().getPath()+"metastore_db;create=true"; + conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname, warehousePath.toUri().toString()); + conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname, jdbcUri); + conf.set(TajoConf.ConfVars.WAREHOUSE_DIR.varname, 
warehousePath.toUri().toString()); + + // create local HCatalogStore. + TajoConf tajoConf = new TajoConf(conf); + store = new HCatalogStore(tajoConf); + store.createDatabase(DB_NAME, null); + } + + @AfterClass + public static void tearDown() throws IOException { + store.close(); + } + + @Test + public void testTableUsingTextFile() throws Exception { + TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, new KeyValueSet()); + + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("c_custkey", TajoDataTypes.Type.INT4); + schema.addColumn("c_name", TajoDataTypes.Type.TEXT); + schema.addColumn("c_address", TajoDataTypes.Type.TEXT); + schema.addColumn("c_nationkey", TajoDataTypes.Type.INT4); + schema.addColumn("c_phone", TajoDataTypes.Type.TEXT); + schema.addColumn("c_acctbal", TajoDataTypes.Type.FLOAT8); + schema.addColumn("c_mktsegment", TajoDataTypes.Type.TEXT); + schema.addColumn("c_comment", TajoDataTypes.Type.TEXT); + + TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta, + new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri()); + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, CUSTOMER)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER)); + assertEquals(table.getName(), table1.getName()); + assertEquals(table.getPath(), table1.getPath()); + assertEquals(table.getSchema().size(), table1.getSchema().size()); + for (int i = 0; i < table.getSchema().size(); i++) { + assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); + } + + assertEquals(StringEscapeUtils.escapeJava(StorageConstants.DEFAULT_FIELD_DELIMITER), + table1.getMeta().getOption(StorageConstants.TEXT_DELIMITER)); + store.dropTable(DB_NAME, CUSTOMER); + } + + @Test + public void testTableUsingRCFileWithBinarySerde() throws Exception { + KeyValueSet options = new KeyValueSet(); + 
options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.RCFILE, options); + + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("r_regionkey", TajoDataTypes.Type.INT4); + schema.addColumn("r_name", TajoDataTypes.Type.TEXT); + schema.addColumn("r_comment", TajoDataTypes.Type.TEXT); + + TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, REGION), schema, meta, + new Path(warehousePath, new Path(DB_NAME, REGION)).toUri()); + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, REGION)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION)); + assertEquals(table.getName(), table1.getName()); + assertEquals(table.getPath(), table1.getPath()); + assertEquals(table.getSchema().size(), table1.getSchema().size()); + for (int i = 0; i < table.getSchema().size(); i++) { + assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); + } + + assertEquals(StorageConstants.DEFAULT_BINARY_SERDE, + table1.getMeta().getOption(StorageConstants.RCFILE_SERDE)); + store.dropTable(DB_NAME, REGION); + } + + @Test + public void testTableUsingRCFileWithTextSerde() throws Exception { + KeyValueSet options = new KeyValueSet(); + options.set(StorageConstants.RCFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.RCFILE, options); + + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("r_regionkey", TajoDataTypes.Type.INT4); + schema.addColumn("r_name", TajoDataTypes.Type.TEXT); + schema.addColumn("r_comment", TajoDataTypes.Type.TEXT); + + TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, REGION), schema, meta, + new Path(warehousePath, new Path(DB_NAME, REGION)).toUri()); + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, 
REGION)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION)); + assertEquals(table.getName(), table1.getName()); + assertEquals(table.getPath(), table1.getPath()); + assertEquals(table.getSchema().size(), table1.getSchema().size()); + for (int i = 0; i < table.getSchema().size(); i++) { + assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); + } + + assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, table1.getMeta().getOption(StorageConstants.RCFILE_SERDE)); + store.dropTable(DB_NAME, REGION); + } + + @Test + public void testTableWithNullValue() throws Exception { + KeyValueSet options = new KeyValueSet(); + options.set(StorageConstants.TEXT_DELIMITER, StringEscapeUtils.escapeJava("\u0002")); + options.set(StorageConstants.TEXT_NULL, StringEscapeUtils.escapeJava("\u0003")); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, options); + + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("s_suppkey", TajoDataTypes.Type.INT4); + schema.addColumn("s_name", TajoDataTypes.Type.TEXT); + schema.addColumn("s_address", TajoDataTypes.Type.TEXT); + schema.addColumn("s_nationkey", TajoDataTypes.Type.INT4); + schema.addColumn("s_phone", TajoDataTypes.Type.TEXT); + schema.addColumn("s_acctbal", TajoDataTypes.Type.FLOAT8); + schema.addColumn("s_comment", TajoDataTypes.Type.TEXT); + + TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, SUPPLIER), schema, meta, + new Path(warehousePath, new Path(DB_NAME, SUPPLIER)).toUri()); + + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, SUPPLIER)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, SUPPLIER)); + assertEquals(table.getName(), table1.getName()); + assertEquals(table.getPath(), table1.getPath()); + assertEquals(table.getSchema().size(), table1.getSchema().size()); + for (int i = 0; i < table.getSchema().size(); i++) { + 
assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); + } + + assertEquals(table.getMeta().getOption(StorageConstants.TEXT_DELIMITER), + table1.getMeta().getOption(StorageConstants.TEXT_DELIMITER)); + + assertEquals(table.getMeta().getOption(StorageConstants.TEXT_NULL), + table1.getMeta().getOption(StorageConstants.TEXT_NULL)); + + assertEquals(table1.getMeta().getOption(StorageConstants.TEXT_DELIMITER), + StringEscapeUtils.escapeJava("\u0002")); + + assertEquals(table1.getMeta().getOption(StorageConstants.TEXT_NULL), + StringEscapeUtils.escapeJava("\u0003")); + + store.dropTable(DB_NAME, SUPPLIER); + + } + + @Test + public void testAddTableByPartition() throws Exception { + TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, new KeyValueSet()); + + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("n_name", TajoDataTypes.Type.TEXT); + schema.addColumn("n_regionkey", TajoDataTypes.Type.INT4); + schema.addColumn("n_comment", TajoDataTypes.Type.TEXT); + + + TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, NATION), schema, meta, + new Path(warehousePath, new Path(DB_NAME, NATION)).toUri()); + + org.apache.tajo.catalog.Schema expressionSchema = new org.apache.tajo.catalog.Schema(); + expressionSchema.addColumn("n_nationkey", TajoDataTypes.Type.INT4); + + PartitionMethodDesc partitions = new PartitionMethodDesc( + DB_NAME, + NATION, + CatalogProtos.PartitionType.COLUMN, expressionSchema.getColumn(0).getQualifiedName(), expressionSchema); + table.setPartitionMethod(partitions); + + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, NATION)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, NATION)); + assertEquals(table.getName(), table1.getName()); + assertEquals(table.getPath(), table1.getPath()); + assertEquals(table.getSchema().size(), table1.getSchema().size()); + for (int i = 0; i < 
table.getSchema().size(); i++) { + assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); + } + + + Schema partitionSchema = table.getPartitionMethod().getExpressionSchema(); + Schema partitionSchema1 = table1.getPartitionMethod().getExpressionSchema(); + assertEquals(partitionSchema.size(), partitionSchema1.size()); + for (int i = 0; i < partitionSchema.size(); i++) { + assertEquals(partitionSchema.getColumn(i).getSimpleName(), partitionSchema1.getColumn(i).getSimpleName()); + } + + store.dropTable(DB_NAME, NATION); + } + + + @Test + public void testGetAllTableNames() throws Exception{ + TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, new KeyValueSet()); + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("n_name", TajoDataTypes.Type.TEXT); + schema.addColumn("n_regionkey", TajoDataTypes.Type.INT4); + schema.addColumn("n_comment", TajoDataTypes.Type.TEXT); + + String[] tableNames = new String[]{"table1", "table2", "table3"}; + + for(String tableName : tableNames){ + TableDesc table = new TableDesc(CatalogUtil.buildFQName("default", tableName), schema, meta, + new Path(warehousePath, new Path(DB_NAME, tableName)).toUri()); + store.createTable(table.getProto()); + } + + List tables = store.getAllTableNames("default"); + assertEquals(tableNames.length, tables.size()); + + for(String tableName : tableNames){ + assertTrue(tables.contains(tableName)); + } + + for(String tableName : tableNames){ + store.dropTable("default", tableName); + } + } + + @Test + public void testDeleteTable() throws Exception { + TableMeta meta = new TableMeta(CatalogProtos.StoreType.CSV, new KeyValueSet()); + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("n_name", TajoDataTypes.Type.TEXT); + schema.addColumn("n_regionkey", TajoDataTypes.Type.INT4); + schema.addColumn("n_comment", TajoDataTypes.Type.TEXT); + + String tableName = 
"table1"; + TableDesc table = new TableDesc(DB_NAME + "." + tableName, schema, meta, warehousePath.toUri()); + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, tableName)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, tableName)); + FileSystem fs = FileSystem.getLocal(new Configuration()); + assertTrue(fs.exists(new Path(table1.getPath()))); + + store.dropTable(DB_NAME, tableName); + assertFalse(store.existTable(DB_NAME, tableName)); + fs.close(); + } + + @Test + public void testTableUsingSequenceFileWithBinarySerde() throws Exception { + KeyValueSet options = new KeyValueSet(); + options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_BINARY_SERDE); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.SEQUENCEFILE, options); + + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("r_regionkey", TajoDataTypes.Type.INT4); + schema.addColumn("r_name", TajoDataTypes.Type.TEXT); + schema.addColumn("r_comment", TajoDataTypes.Type.TEXT); + + TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, REGION), schema, meta, + new Path(warehousePath, new Path(DB_NAME, REGION)).toUri()); + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, REGION)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION)); + assertEquals(table.getName(), table1.getName()); + assertEquals(table.getPath(), table1.getPath()); + assertEquals(table.getSchema().size(), table1.getSchema().size()); + for (int i = 0; i < table.getSchema().size(); i++) { + assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); + } + + assertEquals(StorageConstants.DEFAULT_BINARY_SERDE, + table1.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE)); + store.dropTable(DB_NAME, REGION); + } + + @Test + public void testTableUsingSequenceFileWithTextSerde() throws Exception { + KeyValueSet options = new 
KeyValueSet(); + options.set(StorageConstants.SEQUENCEFILE_SERDE, StorageConstants.DEFAULT_TEXT_SERDE); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.SEQUENCEFILE, options); + + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("r_regionkey", TajoDataTypes.Type.INT4); + schema.addColumn("r_name", TajoDataTypes.Type.TEXT); + schema.addColumn("r_comment", TajoDataTypes.Type.TEXT); + + TableDesc table = new TableDesc(CatalogUtil.buildFQName(DB_NAME, REGION), schema, meta, + new Path(warehousePath, new Path(DB_NAME, REGION)).toUri()); + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, REGION)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, REGION)); + assertEquals(table.getName(), table1.getName()); + assertEquals(table.getPath(), table1.getPath()); + assertEquals(table.getSchema().size(), table1.getSchema().size()); + for (int i = 0; i < table.getSchema().size(); i++) { + assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); + } + + assertEquals(StorageConstants.DEFAULT_TEXT_SERDE, table1.getMeta().getOption(StorageConstants.SEQUENCEFILE_SERDE)); + store.dropTable(DB_NAME, REGION); + } + + + @Test + public void testTableUsingParquet() throws Exception { + TableMeta meta = new TableMeta(CatalogProtos.StoreType.PARQUET, new KeyValueSet()); + + org.apache.tajo.catalog.Schema schema = new org.apache.tajo.catalog.Schema(); + schema.addColumn("c_custkey", TajoDataTypes.Type.INT4); + schema.addColumn("c_name", TajoDataTypes.Type.TEXT); + schema.addColumn("c_address", TajoDataTypes.Type.TEXT); + schema.addColumn("c_nationkey", TajoDataTypes.Type.INT4); + schema.addColumn("c_phone", TajoDataTypes.Type.TEXT); + schema.addColumn("c_acctbal", TajoDataTypes.Type.FLOAT8); + schema.addColumn("c_mktsegment", TajoDataTypes.Type.TEXT); + schema.addColumn("c_comment", TajoDataTypes.Type.TEXT); + + TableDesc table = new 
TableDesc(CatalogUtil.buildFQName(DB_NAME, CUSTOMER), schema, meta, + new Path(warehousePath, new Path(DB_NAME, CUSTOMER)).toUri()); + store.createTable(table.getProto()); + assertTrue(store.existTable(DB_NAME, CUSTOMER)); + + TableDesc table1 = new TableDesc(store.getTable(DB_NAME, CUSTOMER)); + assertEquals(table.getName(), table1.getName()); + assertEquals(table.getPath(), table1.getPath()); + assertEquals(table.getSchema().size(), table1.getSchema().size()); + for (int i = 0; i < table.getSchema().size(); i++) { + assertEquals(table.getSchema().getColumn(i).getSimpleName(), table1.getSchema().getColumn(i).getSimpleName()); + } + + store.dropTable(DB_NAME, CUSTOMER); + } +} From 2af644774e7f6c2d3b85339df8c871d95996e5c6 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Fri, 15 May 2015 11:49:55 +0900 Subject: [PATCH 037/141] TAJO-1605: Fix master build failure on jdk 1.6. (jinho) --- CHANGES | 2 ++ .../org/apache/tajo/service/HAServiceTracker.java | 11 ++++++++--- .../apache/tajo/worker/WorkerHeartbeatService.java | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 39e2dfec9a..355bf3081f 100644 --- a/CHANGES +++ b/CHANGES @@ -39,6 +39,8 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1605: Fix master build failure on jdk 1.6. (jinho) + TAJO-1586: TajoMaster HA startup failure on Yarn. (jaehwa) TAJO-1485: Datum 'Char' returned only 1byte. 
diff --git a/tajo-common/src/main/java/org/apache/tajo/service/HAServiceTracker.java b/tajo-common/src/main/java/org/apache/tajo/service/HAServiceTracker.java index 081b1530a3..8c553e910b 100644 --- a/tajo-common/src/main/java/org/apache/tajo/service/HAServiceTracker.java +++ b/tajo-common/src/main/java/org/apache/tajo/service/HAServiceTracker.java @@ -21,10 +21,9 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.net.NetUtils; -import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.util.FileUtil; import javax.net.SocketFactory; +import java.io.IOException; import java.net.InetSocketAddress; import java.net.Socket; @@ -59,7 +58,13 @@ public static boolean checkConnection(InetSocketAddress address) { } catch (Exception e) { isAlive = false; } finally { - FileUtil.cleanup(LOG, socket); + if (socket != null) { + try { + socket.close(); + } catch (IOException e) { + LOG.debug(e.getMessage(), e); + } + } } return isAlive; } diff --git a/tajo-core/src/main/java/org/apache/tajo/worker/WorkerHeartbeatService.java b/tajo-core/src/main/java/org/apache/tajo/worker/WorkerHeartbeatService.java index ad67f94adb..bb52350bff 100644 --- a/tajo-core/src/main/java/org/apache/tajo/worker/WorkerHeartbeatService.java +++ b/tajo-core/src/main/java/org/apache/tajo/worker/WorkerHeartbeatService.java @@ -76,12 +76,12 @@ public void serviceInit(Configuration conf) throws Exception { this.systemConf = (TajoConf) conf; this.connectionManager = RpcClientManager.getInstance(); + thread = new WorkerHeartbeatThread(); super.serviceInit(conf); } @Override public void serviceStart() throws Exception { - thread = new WorkerHeartbeatThread(); thread.start(); super.serviceStart(); } From 8d4b478d4b73339f3b7a0a63dc2dc5b51023629c Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Fri, 15 May 2015 00:51:40 -0700 Subject: [PATCH 038/141] Change the version of pom.xml files to 0.10.0-SNAPSHOT. 
--- pom.xml | 2 +- tajo-algebra/pom.xml | 2 +- tajo-catalog/pom.xml | 2 +- tajo-catalog/tajo-catalog-client/pom.xml | 2 +- tajo-catalog/tajo-catalog-common/pom.xml | 2 +- tajo-catalog/tajo-catalog-drivers/pom.xml | 2 +- tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml | 2 +- tajo-catalog/tajo-catalog-server/pom.xml | 2 +- tajo-cli/pom.xml | 2 +- tajo-client/pom.xml | 2 +- tajo-common/pom.xml | 2 +- tajo-core/pom.xml | 2 +- tajo-dist/pom.xml | 2 +- tajo-docs/pom.xml | 2 +- tajo-jdbc/pom.xml | 2 +- tajo-maven-plugins/pom.xml | 2 +- tajo-plan/pom.xml | 2 +- tajo-project/pom.xml | 4 ++-- tajo-pullserver/pom.xml | 2 +- tajo-rpc/pom.xml | 2 +- tajo-storage/pom.xml | 2 +- tajo-storage/tajo-storage-common/pom.xml | 4 ++-- tajo-storage/tajo-storage-hbase/pom.xml | 4 ++-- tajo-storage/tajo-storage-hdfs/pom.xml | 4 ++-- tajo-thirdparty/asm/pom.xml | 2 +- 25 files changed, 29 insertions(+), 29 deletions(-) diff --git a/pom.xml b/pom.xml index b6f0d16815..8ad9d3bd59 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.tajo tajo-main - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT Tajo Main Tajo Main pom diff --git a/tajo-algebra/pom.xml b/tajo-algebra/pom.xml index 5b19e1ac9b..66a86436fb 100644 --- a/tajo-algebra/pom.xml +++ b/tajo-algebra/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-algebra diff --git a/tajo-catalog/pom.xml b/tajo-catalog/pom.xml index 9cfdd5d3a0..3745785cc8 100644 --- a/tajo-catalog/pom.xml +++ b/tajo-catalog/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-client/pom.xml b/tajo-catalog/tajo-catalog-client/pom.xml index 98b85a8320..46db4f2acc 100644 --- a/tajo-catalog/tajo-catalog-client/pom.xml +++ b/tajo-catalog/tajo-catalog-client/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git 
a/tajo-catalog/tajo-catalog-common/pom.xml b/tajo-catalog/tajo-catalog-common/pom.xml index 9164996b16..2622b895ca 100644 --- a/tajo-catalog/tajo-catalog-common/pom.xml +++ b/tajo-catalog/tajo-catalog-common/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-drivers/pom.xml b/tajo-catalog/tajo-catalog-drivers/pom.xml index 89f3061126..221a2b4404 100644 --- a/tajo-catalog/tajo-catalog-drivers/pom.xml +++ b/tajo-catalog/tajo-catalog-drivers/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml index fe8f34a436..80428d6be1 100644 --- a/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml +++ b/tajo-catalog/tajo-catalog-drivers/tajo-hcatalog/pom.xml @@ -23,7 +23,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-server/pom.xml b/tajo-catalog/tajo-catalog-server/pom.xml index 501f9af611..2c5a4bd7cb 100644 --- a/tajo-catalog/tajo-catalog-server/pom.xml +++ b/tajo-catalog/tajo-catalog-server/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-cli/pom.xml b/tajo-cli/pom.xml index 684c298fb7..8faf1196d4 100644 --- a/tajo-cli/pom.xml +++ b/tajo-cli/pom.xml @@ -23,7 +23,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-cli diff --git a/tajo-client/pom.xml b/tajo-client/pom.xml index 692e1b5287..a1ffc97f7f 100644 --- a/tajo-client/pom.xml +++ b/tajo-client/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-client diff --git a/tajo-common/pom.xml b/tajo-common/pom.xml index 2f96a2cee3..58b4b589d5 100644 --- 
a/tajo-common/pom.xml +++ b/tajo-common/pom.xml @@ -17,7 +17,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project diff --git a/tajo-core/pom.xml b/tajo-core/pom.xml index 42b143b1ad..bc11d1c0c9 100644 --- a/tajo-core/pom.xml +++ b/tajo-core/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-core diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml index aed7b4be43..bf7c8146ee 100644 --- a/tajo-dist/pom.xml +++ b/tajo-dist/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-docs/pom.xml b/tajo-docs/pom.xml index a1e2a142bd..71cfebd2b9 100644 --- a/tajo-docs/pom.xml +++ b/tajo-docs/pom.xml @@ -19,7 +19,7 @@ limitations under the License. tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-docs diff --git a/tajo-jdbc/pom.xml b/tajo-jdbc/pom.xml index de4520a316..0db314a0c4 100644 --- a/tajo-jdbc/pom.xml +++ b/tajo-jdbc/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-jdbc diff --git a/tajo-maven-plugins/pom.xml b/tajo-maven-plugins/pom.xml index 3b4dd10b82..fe0dc9e7c1 100644 --- a/tajo-maven-plugins/pom.xml +++ b/tajo-maven-plugins/pom.xml @@ -17,7 +17,7 @@ org.apache.tajo tajo-project - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project org.apache.tajo diff --git a/tajo-plan/pom.xml b/tajo-plan/pom.xml index ee68a00e75..d72d8953af 100644 --- a/tajo-plan/pom.xml +++ b/tajo-plan/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-plan diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index d9685ee670..2f6f53f752 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -22,7 +22,7 @@ org.apache.tajo tajo-main - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT org.apache.tajo tajo-project @@ -35,7 +35,7 @@ UTF-8 2.6.0 2.5.0 - 0.11.0-SNAPSHOT + 
0.10.1-SNAPSHOT 0.98.7-hadoop2 4.0.25.Final ${project.parent.relativePath}/.. diff --git a/tajo-pullserver/pom.xml b/tajo-pullserver/pom.xml index 944cf3dd52..71aa5fc0ac 100644 --- a/tajo-pullserver/pom.xml +++ b/tajo-pullserver/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-rpc/pom.xml b/tajo-rpc/pom.xml index 8c626b4bc9..7061722e02 100644 --- a/tajo-rpc/pom.xml +++ b/tajo-rpc/pom.xml @@ -20,7 +20,7 @@ 4.0.0 tajo-project - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT org.apache.tajo ../tajo-project diff --git a/tajo-storage/pom.xml b/tajo-storage/pom.xml index 913e7192ad..58a010fa7a 100644 --- a/tajo-storage/pom.xml +++ b/tajo-storage/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-common/pom.xml b/tajo-storage/tajo-storage-common/pom.xml index f7c9676799..ad652e1c2f 100644 --- a/tajo-storage/tajo-storage-common/pom.xml +++ b/tajo-storage/tajo-storage-common/pom.xml @@ -21,7 +21,7 @@ limitations under the License. tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 @@ -334,4 +334,4 @@ limitations under the License. 
- \ No newline at end of file + diff --git a/tajo-storage/tajo-storage-hbase/pom.xml b/tajo-storage/tajo-storage-hbase/pom.xml index 3456b76a25..1187b042e7 100644 --- a/tajo-storage/tajo-storage-hbase/pom.xml +++ b/tajo-storage/tajo-storage-hbase/pom.xml @@ -23,7 +23,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 @@ -354,4 +354,4 @@ - \ No newline at end of file + diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 3a59ec9f23..f3d5f23e17 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -23,7 +23,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 @@ -395,4 +395,4 @@ - \ No newline at end of file + diff --git a/tajo-thirdparty/asm/pom.xml b/tajo-thirdparty/asm/pom.xml index c781a52ab1..a085866adb 100644 --- a/tajo-thirdparty/asm/pom.xml +++ b/tajo-thirdparty/asm/pom.xml @@ -17,7 +17,7 @@ tajo-project org.apache.tajo - 0.11.0-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project From 02c1bd0d2df5cd747da4d83b919441414db4202b Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 27 Mar 2015 17:54:20 +0900 Subject: [PATCH 039/141] TAJO-1440: Some tests fail in parallel test environment in TestKillQuery. (Contributed by Jongyoung Park. Committed by jinho) --- CHANGES | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES b/CHANGES index 355bf3081f..bb1821bbd8 100644 --- a/CHANGES +++ b/CHANGES @@ -39,6 +39,9 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1440: Some tests fail in parallel test environment in TestKillQuery. + (Contributed by Jongyoung Park. Committed by jinho) + TAJO-1605: Fix master build failure on jdk 1.6. (jinho) TAJO-1586: TajoMaster HA startup failure on Yarn. 
(jaehwa) From 8864f8537e2499645fbd161ea50d9449a15451d0 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Mon, 18 May 2015 01:52:15 -0700 Subject: [PATCH 040/141] TAJO-1612: TestKillQuery occassionally fails. Closes #575 --- CHANGES | 7 +- .../tajo/querymaster/TestKillQuery.java | 66 +++++++++++++------ 2 files changed, 51 insertions(+), 22 deletions(-) diff --git a/CHANGES b/CHANGES index bb1821bbd8..50b735c296 100644 --- a/CHANGES +++ b/CHANGES @@ -12,8 +12,9 @@ Release 0.10.1 - unreleased TAJO-1452: Improve function listing order (Contributed Dongjoon Hyun, Committed by hyunsik) - TAJO-1576: Sometimes DefaultTajoCliOutputFormatter.parseErrorMessage() eliminates - an important kind of information. (Contributed by Jongyoung Park, Committed by jihoon) + TAJO-1576: Sometimes DefaultTajoCliOutputFormatter.parseErrorMessage() + eliminates an important kind of information. (Contributed by Jongyoung + Park, Committed by jihoon) TAJO-1381: Support multi-bytes delimiter for Text file. (Contributed by navis, Committed by jinho) @@ -39,6 +40,8 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1612: TestKillQuery occassionally fails. (hyunsik) + TAJO-1440: Some tests fail in parallel test environment in TestKillQuery. (Contributed by Jongyoung Park. 
Committed by jinho) diff --git a/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java b/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java index b2e1ce9587..89cac75bea 100644 --- a/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java +++ b/tajo-core/src/test/java/org/apache/tajo/querymaster/TestKillQuery.java @@ -33,11 +33,7 @@ import org.apache.tajo.engine.planner.global.MasterPlan; import org.apache.tajo.engine.query.QueryContext; import org.apache.tajo.engine.query.TaskRequestImpl; -import org.apache.tajo.ipc.ClientProtos; -import org.apache.tajo.master.event.QueryEvent; -import org.apache.tajo.master.event.QueryEventType; -import org.apache.tajo.master.event.StageEvent; -import org.apache.tajo.master.event.StageEventType; +import org.apache.tajo.master.event.*; import org.apache.tajo.plan.LogicalOptimizer; import org.apache.tajo.plan.LogicalPlan; import org.apache.tajo.plan.LogicalPlanner; @@ -131,8 +127,7 @@ public final void testKillQueryFromInitState() throws Exception { assertNotNull(stage); // fire kill event - Query q = queryMasterTask.getQuery(); - q.handle(new QueryEvent(queryId, QueryEventType.KILL)); + queryMasterTask.getEventHandler().handle(new QueryEvent(queryId, QueryEventType.KILL)); try { cluster.waitForQueryState(queryMasterTask.getQuery(), TajoProtos.QueryState.QUERY_KILLED, 50); @@ -157,24 +152,55 @@ public final void testKillQueryFromInitState() throws Exception { @Test public final void testIgnoreStageStateFromKilled() throws Exception { - ClientProtos.SubmitQueryResponse res = client.executeQuery(queryStr); - QueryId queryId = new QueryId(res.getQueryId()); - cluster.waitForQuerySubmitted(queryId); + SQLAnalyzer analyzer = new SQLAnalyzer(); + QueryContext defaultContext = LocalTajoTestingUtility.createDummyContext(conf); + Session session = LocalTajoTestingUtility.createDummySession(); + CatalogService catalog = cluster.getMaster().getCatalog(); + + LogicalPlanner planner = new 
LogicalPlanner(catalog); + LogicalOptimizer optimizer = new LogicalOptimizer(conf); + Expr expr = analyzer.parse(queryStr); + LogicalPlan plan = planner.createPlan(defaultContext, expr); + + optimizer.optimize(plan); + + QueryId queryId = QueryIdFactory.newQueryId(System.currentTimeMillis(), 0); + QueryContext queryContext = new QueryContext(conf); + MasterPlan masterPlan = new MasterPlan(queryId, queryContext, plan); + GlobalPlanner globalPlanner = new GlobalPlanner(conf, catalog); + globalPlanner.build(masterPlan); - QueryMasterTask qmt = cluster.getQueryMasterTask(queryId); - Query query = qmt.getQuery(); + CountDownLatch barrier = new CountDownLatch(1); + MockAsyncDispatch dispatch = new MockAsyncDispatch(barrier, TajoProtos.QueryState.QUERY_RUNNING); + + QueryMaster qm = cluster.getTajoWorkers().get(0).getWorkerContext().getQueryMaster(); + QueryMasterTask queryMasterTask = new QueryMasterTask(qm.getContext(), + queryId, session, defaultContext, expr.toJson(), dispatch); - // wait for a stage created - cluster.waitForQueryState(query, TajoProtos.QueryState.QUERY_RUNNING, 10); - query.handle(new QueryEvent(queryId, QueryEventType.KILL)); + queryMasterTask.init(conf); + queryMasterTask.getQueryTaskContext().getDispatcher().start(); + queryMasterTask.startQuery(); try{ - cluster.waitForQueryState(query, TajoProtos.QueryState.QUERY_KILLED, 50); - } finally { - assertEquals(TajoProtos.QueryState.QUERY_KILLED, query.getSynchronizedState()); + barrier.await(5000, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + fail("Query state : " + queryMasterTask.getQuery().getSynchronizedState()); + } + + Stage stage = queryMasterTask.getQuery().getStages().iterator().next(); + assertNotNull(stage); + + // fire kill event + queryMasterTask.getEventHandler().handle(new QueryEvent(queryId, QueryEventType.KILL)); + + try { + cluster.waitForQueryState(queryMasterTask.getQuery(), TajoProtos.QueryState.QUERY_KILLED, 50); + assertEquals(TajoProtos.QueryState.QUERY_KILLED, 
queryMasterTask.getQuery().getSynchronizedState()); + } finally { + queryMasterTask.stop(); } - List stages = Lists.newArrayList(query.getStages()); + List stages = Lists.newArrayList(queryMasterTask.getQuery().getStages()); Stage lastStage = stages.get(stages.size() - 1); assertEquals(StageState.KILLED, lastStage.getSynchronizedState()); @@ -244,4 +270,4 @@ protected void dispatch(Event event) { super.dispatch(event); } } -} +} \ No newline at end of file From 7aeffd7e0d2f440c08abaefbb3f8131d595d1afd Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Mon, 18 May 2015 02:27:10 -0700 Subject: [PATCH 041/141] [maven-release-plugin] prepare release release-0.10.1-rc0 --- pom.xml | 4 ++-- tajo-algebra/pom.xml | 2 +- tajo-catalog/pom.xml | 2 +- tajo-catalog/tajo-catalog-client/pom.xml | 2 +- tajo-catalog/tajo-catalog-common/pom.xml | 2 +- tajo-catalog/tajo-catalog-drivers/pom.xml | 2 +- tajo-catalog/tajo-catalog-server/pom.xml | 2 +- tajo-cli/pom.xml | 5 ++--- tajo-client/pom.xml | 2 +- tajo-common/pom.xml | 2 +- tajo-core/pom.xml | 2 +- tajo-dist/pom.xml | 2 +- tajo-jdbc/pom.xml | 2 +- tajo-maven-plugins/pom.xml | 2 +- tajo-plan/pom.xml | 2 +- tajo-project/pom.xml | 10 +++++----- tajo-pullserver/pom.xml | 2 +- tajo-rpc/pom.xml | 2 +- tajo-storage/pom.xml | 2 +- tajo-storage/tajo-storage-common/pom.xml | 4 ++-- tajo-storage/tajo-storage-hbase/pom.xml | 6 ++---- tajo-storage/tajo-storage-hdfs/pom.xml | 6 ++---- tajo-thirdparty/asm/pom.xml | 2 +- 23 files changed, 32 insertions(+), 37 deletions(-) diff --git a/pom.xml b/pom.xml index 8ad9d3bd59..280b7b8b3b 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.tajo tajo-main - 0.10.1-SNAPSHOT + 0.10.1 Tajo Main Tajo Main pom @@ -57,7 +57,7 @@ https://git-wip-us.apache.org/repos/asf/tajo.git scm:git:https://git-wip-us.apache.org/repos/asf/tajo.git scm:git:https://git-wip-us.apache.org/repos/asf/tajo.git - HEAD + release-0.10.1-rc0 diff --git a/tajo-algebra/pom.xml b/tajo-algebra/pom.xml index 66a86436fb..3cf4e010c3 
100644 --- a/tajo-algebra/pom.xml +++ b/tajo-algebra/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project tajo-algebra diff --git a/tajo-catalog/pom.xml b/tajo-catalog/pom.xml index 3745785cc8..9a23f7ebd0 100644 --- a/tajo-catalog/pom.xml +++ b/tajo-catalog/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-client/pom.xml b/tajo-catalog/tajo-catalog-client/pom.xml index 46db4f2acc..22d99a5ab4 100644 --- a/tajo-catalog/tajo-catalog-client/pom.xml +++ b/tajo-catalog/tajo-catalog-client/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-common/pom.xml b/tajo-catalog/tajo-catalog-common/pom.xml index 2622b895ca..47bd0f0442 100644 --- a/tajo-catalog/tajo-catalog-common/pom.xml +++ b/tajo-catalog/tajo-catalog-common/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-drivers/pom.xml b/tajo-catalog/tajo-catalog-drivers/pom.xml index 221a2b4404..85bb54b7da 100644 --- a/tajo-catalog/tajo-catalog-drivers/pom.xml +++ b/tajo-catalog/tajo-catalog-drivers/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-server/pom.xml b/tajo-catalog/tajo-catalog-server/pom.xml index 2c5a4bd7cb..6cf4193332 100644 --- a/tajo-catalog/tajo-catalog-server/pom.xml +++ b/tajo-catalog/tajo-catalog-server/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../../tajo-project 4.0.0 diff --git a/tajo-cli/pom.xml b/tajo-cli/pom.xml index 8faf1196d4..b6c94c43a9 100644 --- a/tajo-cli/pom.xml +++ b/tajo-cli/pom.xml @@ -17,13 +17,12 @@ ~ limitations under the License. 
--> - + 4.0.0 tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project tajo-cli diff --git a/tajo-client/pom.xml b/tajo-client/pom.xml index a1ffc97f7f..8f7a8430de 100644 --- a/tajo-client/pom.xml +++ b/tajo-client/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project tajo-client diff --git a/tajo-common/pom.xml b/tajo-common/pom.xml index 58b4b589d5..b63eee94dd 100644 --- a/tajo-common/pom.xml +++ b/tajo-common/pom.xml @@ -17,7 +17,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project diff --git a/tajo-core/pom.xml b/tajo-core/pom.xml index bc11d1c0c9..5c4d039711 100644 --- a/tajo-core/pom.xml +++ b/tajo-core/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project tajo-core diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml index bf7c8146ee..57ed18373d 100644 --- a/tajo-dist/pom.xml +++ b/tajo-dist/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project 4.0.0 diff --git a/tajo-jdbc/pom.xml b/tajo-jdbc/pom.xml index 0db314a0c4..89599371cd 100644 --- a/tajo-jdbc/pom.xml +++ b/tajo-jdbc/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project tajo-jdbc diff --git a/tajo-maven-plugins/pom.xml b/tajo-maven-plugins/pom.xml index fe0dc9e7c1..0e667b6563 100644 --- a/tajo-maven-plugins/pom.xml +++ b/tajo-maven-plugins/pom.xml @@ -17,7 +17,7 @@ org.apache.tajo tajo-project - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project org.apache.tajo diff --git a/tajo-plan/pom.xml b/tajo-plan/pom.xml index d72d8953af..e6e1eb2e16 100644 --- a/tajo-plan/pom.xml +++ b/tajo-plan/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project tajo-plan diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index 2f6f53f752..58b3b7d600 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -22,7 +22,7 @@ org.apache.tajo tajo-main - 0.10.1-SNAPSHOT + 
0.10.1 org.apache.tajo tajo-project @@ -35,7 +35,7 @@ UTF-8 2.6.0 2.5.0 - 0.10.1-SNAPSHOT + 0.10.1 0.98.7-hadoop2 4.0.25.Final ${project.parent.relativePath}/.. @@ -108,7 +108,7 @@ hjkim Hyoung Jun Kim hjkim@apache.org - + PMC @@ -168,7 +168,7 @@ jihun Jihun Kang jihun@apache.org - + Committer @@ -326,7 +326,7 @@ https://git-wip-us.apache.org/repos/asf/tajo.git scm:git:https://git-wip-us.apache.org/repos/asf/tajo.git scm:git:https://git-wip-us.apache.org/repos/asf/tajo.git - HEAD + release-0.10.1-rc0 diff --git a/tajo-pullserver/pom.xml b/tajo-pullserver/pom.xml index 71aa5fc0ac..ae174ac378 100644 --- a/tajo-pullserver/pom.xml +++ b/tajo-pullserver/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project 4.0.0 diff --git a/tajo-rpc/pom.xml b/tajo-rpc/pom.xml index 7061722e02..a1727e259d 100644 --- a/tajo-rpc/pom.xml +++ b/tajo-rpc/pom.xml @@ -20,7 +20,7 @@ 4.0.0 tajo-project - 0.10.1-SNAPSHOT + 0.10.1 org.apache.tajo ../tajo-project diff --git a/tajo-storage/pom.xml b/tajo-storage/pom.xml index 58a010fa7a..4efac11434 100644 --- a/tajo-storage/pom.xml +++ b/tajo-storage/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-common/pom.xml b/tajo-storage/tajo-storage-common/pom.xml index ad652e1c2f..f3884797a0 100644 --- a/tajo-storage/tajo-storage-common/pom.xml +++ b/tajo-storage/tajo-storage-common/pom.xml @@ -17,11 +17,11 @@ See the License for the specific language governing permissions and limitations under the License. --> - + tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-hbase/pom.xml b/tajo-storage/tajo-storage-hbase/pom.xml index 1187b042e7..ebb8e24744 100644 --- a/tajo-storage/tajo-storage-hbase/pom.xml +++ b/tajo-storage/tajo-storage-hbase/pom.xml @@ -17,13 +17,11 @@ limitations under the License. 
--> - + tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index f3d5f23e17..985eb50e6f 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -17,13 +17,11 @@ limitations under the License. --> - + tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../../tajo-project 4.0.0 diff --git a/tajo-thirdparty/asm/pom.xml b/tajo-thirdparty/asm/pom.xml index a085866adb..c02ebd6d51 100644 --- a/tajo-thirdparty/asm/pom.xml +++ b/tajo-thirdparty/asm/pom.xml @@ -17,7 +17,7 @@ tajo-project org.apache.tajo - 0.10.1-SNAPSHOT + 0.10.1 ../../tajo-project From 1e99099bd6b77c2309cb2329115f57eb916c9be9 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Mon, 18 May 2015 02:27:33 -0700 Subject: [PATCH 042/141] [maven-release-plugin] prepare for next development iteration --- pom.xml | 4 ++-- tajo-algebra/pom.xml | 2 +- tajo-catalog/pom.xml | 2 +- tajo-catalog/tajo-catalog-client/pom.xml | 2 +- tajo-catalog/tajo-catalog-common/pom.xml | 2 +- tajo-catalog/tajo-catalog-drivers/pom.xml | 2 +- tajo-catalog/tajo-catalog-server/pom.xml | 2 +- tajo-cli/pom.xml | 2 +- tajo-client/pom.xml | 2 +- tajo-common/pom.xml | 2 +- tajo-core/pom.xml | 2 +- tajo-dist/pom.xml | 2 +- tajo-jdbc/pom.xml | 2 +- tajo-maven-plugins/pom.xml | 2 +- tajo-plan/pom.xml | 2 +- tajo-project/pom.xml | 6 +++--- tajo-pullserver/pom.xml | 2 +- tajo-rpc/pom.xml | 2 +- tajo-storage/pom.xml | 2 +- tajo-storage/tajo-storage-common/pom.xml | 2 +- tajo-storage/tajo-storage-hbase/pom.xml | 2 +- tajo-storage/tajo-storage-hdfs/pom.xml | 2 +- tajo-thirdparty/asm/pom.xml | 2 +- 23 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pom.xml b/pom.xml index 280b7b8b3b..1f0c5ff5e0 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.tajo tajo-main - 0.10.1 + 0.10.2-SNAPSHOT Tajo Main Tajo Main pom @@ -57,7 +57,7 @@ 
https://git-wip-us.apache.org/repos/asf/tajo.git scm:git:https://git-wip-us.apache.org/repos/asf/tajo.git scm:git:https://git-wip-us.apache.org/repos/asf/tajo.git - release-0.10.1-rc0 + HEAD diff --git a/tajo-algebra/pom.xml b/tajo-algebra/pom.xml index 3cf4e010c3..95151eb06d 100644 --- a/tajo-algebra/pom.xml +++ b/tajo-algebra/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project tajo-algebra diff --git a/tajo-catalog/pom.xml b/tajo-catalog/pom.xml index 9a23f7ebd0..c722368699 100644 --- a/tajo-catalog/pom.xml +++ b/tajo-catalog/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-client/pom.xml b/tajo-catalog/tajo-catalog-client/pom.xml index 22d99a5ab4..3be6587470 100644 --- a/tajo-catalog/tajo-catalog-client/pom.xml +++ b/tajo-catalog/tajo-catalog-client/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-common/pom.xml b/tajo-catalog/tajo-catalog-common/pom.xml index 47bd0f0442..7af058151b 100644 --- a/tajo-catalog/tajo-catalog-common/pom.xml +++ b/tajo-catalog/tajo-catalog-common/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-drivers/pom.xml b/tajo-catalog/tajo-catalog-drivers/pom.xml index 85bb54b7da..00a7ebd108 100644 --- a/tajo-catalog/tajo-catalog-drivers/pom.xml +++ b/tajo-catalog/tajo-catalog-drivers/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-server/pom.xml b/tajo-catalog/tajo-catalog-server/pom.xml index 6cf4193332..27c58a54b5 100644 --- a/tajo-catalog/tajo-catalog-server/pom.xml +++ b/tajo-catalog/tajo-catalog-server/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../../tajo-project 4.0.0 diff 
--git a/tajo-cli/pom.xml b/tajo-cli/pom.xml index b6c94c43a9..b955681bde 100644 --- a/tajo-cli/pom.xml +++ b/tajo-cli/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project tajo-cli diff --git a/tajo-client/pom.xml b/tajo-client/pom.xml index 8f7a8430de..08f75fee0d 100644 --- a/tajo-client/pom.xml +++ b/tajo-client/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project tajo-client diff --git a/tajo-common/pom.xml b/tajo-common/pom.xml index b63eee94dd..475e47f87e 100644 --- a/tajo-common/pom.xml +++ b/tajo-common/pom.xml @@ -17,7 +17,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project diff --git a/tajo-core/pom.xml b/tajo-core/pom.xml index 5c4d039711..1d4dc583b5 100644 --- a/tajo-core/pom.xml +++ b/tajo-core/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project tajo-core diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml index 57ed18373d..b465c6f991 100644 --- a/tajo-dist/pom.xml +++ b/tajo-dist/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-jdbc/pom.xml b/tajo-jdbc/pom.xml index 89599371cd..b9e6930e8c 100644 --- a/tajo-jdbc/pom.xml +++ b/tajo-jdbc/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project tajo-jdbc diff --git a/tajo-maven-plugins/pom.xml b/tajo-maven-plugins/pom.xml index 0e667b6563..543e0f05eb 100644 --- a/tajo-maven-plugins/pom.xml +++ b/tajo-maven-plugins/pom.xml @@ -17,7 +17,7 @@ org.apache.tajo tajo-project - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project org.apache.tajo diff --git a/tajo-plan/pom.xml b/tajo-plan/pom.xml index e6e1eb2e16..cf5c4171b1 100644 --- a/tajo-plan/pom.xml +++ b/tajo-plan/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project tajo-plan diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index 
58b3b7d600..d0b2a9c407 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -22,7 +22,7 @@ org.apache.tajo tajo-main - 0.10.1 + 0.10.2-SNAPSHOT org.apache.tajo tajo-project @@ -35,7 +35,7 @@ UTF-8 2.6.0 2.5.0 - 0.10.1 + 0.10.2-SNAPSHOT 0.98.7-hadoop2 4.0.25.Final ${project.parent.relativePath}/.. @@ -326,7 +326,7 @@ https://git-wip-us.apache.org/repos/asf/tajo.git scm:git:https://git-wip-us.apache.org/repos/asf/tajo.git scm:git:https://git-wip-us.apache.org/repos/asf/tajo.git - release-0.10.1-rc0 + HEAD diff --git a/tajo-pullserver/pom.xml b/tajo-pullserver/pom.xml index ae174ac378..516b16495a 100644 --- a/tajo-pullserver/pom.xml +++ b/tajo-pullserver/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-rpc/pom.xml b/tajo-rpc/pom.xml index a1727e259d..fc57c0dff2 100644 --- a/tajo-rpc/pom.xml +++ b/tajo-rpc/pom.xml @@ -20,7 +20,7 @@ 4.0.0 tajo-project - 0.10.1 + 0.10.2-SNAPSHOT org.apache.tajo ../tajo-project diff --git a/tajo-storage/pom.xml b/tajo-storage/pom.xml index 4efac11434..faa1aaae10 100644 --- a/tajo-storage/pom.xml +++ b/tajo-storage/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-common/pom.xml b/tajo-storage/tajo-storage-common/pom.xml index f3884797a0..60b886a04c 100644 --- a/tajo-storage/tajo-storage-common/pom.xml +++ b/tajo-storage/tajo-storage-common/pom.xml @@ -21,7 +21,7 @@ limitations under the License. 
tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-hbase/pom.xml b/tajo-storage/tajo-storage-hbase/pom.xml index ebb8e24744..79a8b99b89 100644 --- a/tajo-storage/tajo-storage-hbase/pom.xml +++ b/tajo-storage/tajo-storage-hbase/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 985eb50e6f..b89bf4d6b0 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-thirdparty/asm/pom.xml b/tajo-thirdparty/asm/pom.xml index c02ebd6d51..1483611554 100644 --- a/tajo-thirdparty/asm/pom.xml +++ b/tajo-thirdparty/asm/pom.xml @@ -17,7 +17,7 @@ tajo-project org.apache.tajo - 0.10.1 + 0.10.2-SNAPSHOT ../../tajo-project From aa6651b74541e424eb6fc895f92c14838d8ca232 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Mon, 13 Apr 2015 08:14:15 +0900 Subject: [PATCH 043/141] initial ORC scanner --- .../apache/tajo/storage/orc/OrcScanner.java | 74 +++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java new file mode 100644 index 0000000000..279f3c6a46 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -0,0 +1,74 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.storage.FileScanner; +import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.fragment.Fragment; + +import java.io.IOException; + +public class OrcScanner extends FileScanner { + + public OrcScanner(Configuration conf, final Schema schema, final TableMeta meta, final Fragment fragment) { + super(conf, schema, meta, fragment); + } + + @Override + public void init() throws IOException { + if (targets == null) { + targets = schema.toArray(); + } + + super.init(); + } + + @Override + public Tuple next() throws IOException { + return null; + } + + @Override + public void reset() throws IOException { + + } + + @Override + public void close() throws IOException { + + } + + @Override + public boolean isProjectable() { + return false; + } + + @Override + public boolean isSelectable() { + return false; + } + + @Override + public boolean isSplittable() { + return false; + } +} From 7b057d27b0b92a8282a6833e667cca8848f8691f Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 14 May 2015 15:49:18 +0900 Subject: [PATCH 044/141] ORC fundamental code importing from Presto --- tajo-storage/tajo-storage-hdfs/pom.xml | 53 ++ .../thirdparty/orc/AbstractOrcDataSource.java | 233 +++++++++ 
.../storage/thirdparty/orc/BooleanVector.java | 45 ++ .../storage/thirdparty/orc/DiskRange.java | 99 ++++ .../storage/thirdparty/orc/DoubleVector.java | 45 ++ .../thirdparty/orc/FileOrcDataSource.java | 49 ++ .../storage/thirdparty/orc/LongVector.java | 45 ++ .../storage/thirdparty/orc/ObjectVector.java | 37 ++ .../orc/OrcCorruptionException.java | 37 ++ .../storage/thirdparty/orc/OrcDataSource.java | 40 ++ .../thirdparty/orc/OrcDataSourceUtils.java | 90 ++++ .../storage/thirdparty/orc/OrcPredicate.java | 40 ++ .../tajo/storage/thirdparty/orc/RowGroup.java | 58 +++ .../storage/thirdparty/orc/SliceVector.java | 44 ++ .../thirdparty/orc/StreamDescriptor.java | 83 +++ .../tajo/storage/thirdparty/orc/StreamId.java | 76 +++ .../tajo/storage/thirdparty/orc/Stripe.java | 70 +++ .../storage/thirdparty/orc/StripeReader.java | 402 +++++++++++++++ .../tajo/storage/thirdparty/orc/Vector.java | 24 + .../checkpoint/BooleanStreamCheckpoint.java | 58 +++ .../checkpoint/ByteArrayStreamCheckpoint.java | 50 ++ .../orc/checkpoint/ByteStreamCheckpoint.java | 60 +++ .../orc/checkpoint/Checkpoints.java | 408 +++++++++++++++ .../checkpoint/DoubleStreamCheckpoint.java | 50 ++ .../orc/checkpoint/FloatStreamCheckpoint.java | 50 ++ .../orc/checkpoint/InputStreamCheckpoint.java | 64 +++ .../InvalidCheckpointException.java | 25 + .../orc/checkpoint/LongStreamCheckpoint.java | 19 + .../checkpoint/LongStreamDwrfCheckpoint.java | 50 ++ .../checkpoint/LongStreamV1Checkpoint.java | 60 +++ .../checkpoint/LongStreamV2Checkpoint.java | 60 +++ ...GroupDictionaryLengthStreamCheckpoint.java | 53 ++ .../orc/checkpoint/StreamCheckpoint.java | 18 + .../orc/metadata/BooleanStatistics.java | 29 ++ .../orc/metadata/ColumnEncoding.java | 57 ++ .../orc/metadata/ColumnStatistics.java | 74 +++ .../orc/metadata/CompressionKind.java | 19 + .../orc/metadata/DateStatistics.java | 39 ++ .../orc/metadata/DoubleStatistics.java | 39 ++ .../orc/metadata/DwrfMetadataReader.java | 373 ++++++++++++++ 
.../thirdparty/orc/metadata/Footer.java | 76 +++ .../orc/metadata/IntegerStatistics.java | 37 ++ .../thirdparty/orc/metadata/Metadata.java | 31 ++ .../orc/metadata/MetadataReader.java | 36 ++ .../orc/metadata/OrcMetadataReader.java | 487 ++++++++++++++++++ .../thirdparty/orc/metadata/OrcType.java | 105 ++++ .../thirdparty/orc/metadata/PostScript.java | 76 +++ .../orc/metadata/RangeStatistics.java | 20 + .../orc/metadata/RowGroupIndex.java | 42 ++ .../thirdparty/orc/metadata/Stream.java | 78 +++ .../orc/metadata/StringStatistics.java | 41 ++ .../thirdparty/orc/metadata/StripeFooter.java | 42 ++ .../orc/metadata/StripeInformation.java | 71 +++ .../orc/metadata/StripeStatistics.java | 35 ++ .../orc/reader/BooleanStreamReader.java | 160 ++++++ .../orc/reader/ByteStreamReader.java | 162 ++++++ .../orc/reader/DoubleStreamReader.java | 162 ++++++ .../orc/reader/FloatStreamReader.java | 163 ++++++ .../reader/LongDictionaryStreamReader.java | 219 ++++++++ .../orc/reader/LongDirectStreamReader.java | 162 ++++++ .../orc/reader/LongStreamReader.java | 88 ++++ .../thirdparty/orc/reader/OrcReaderUtils.java | 32 ++ .../reader/SliceDictionaryStreamReader.java | 286 ++++++++++ .../orc/reader/SliceDirectStreamReader.java | 211 ++++++++ .../orc/reader/SliceStreamReader.java | 88 ++++ .../thirdparty/orc/reader/StreamReader.java | 34 ++ .../orc/reader/TimestampStreamReader.java | 230 +++++++++ .../thirdparty/orc/stream/BooleanStream.java | 211 ++++++++ .../orc/stream/ByteArrayStream.java | 67 +++ .../thirdparty/orc/stream/ByteStream.java | 138 +++++ .../orc/stream/CheckpointStreamSource.java | 69 +++ .../thirdparty/orc/stream/DoubleStream.java | 104 ++++ .../thirdparty/orc/stream/FloatStream.java | 109 ++++ .../thirdparty/orc/stream/LongDecode.java | 178 +++++++ .../thirdparty/orc/stream/LongStream.java | 40 ++ .../thirdparty/orc/stream/LongStreamDwrf.java | 129 +++++ .../thirdparty/orc/stream/LongStreamV1.java | 188 +++++++ .../thirdparty/orc/stream/LongStreamV2.java | 456 
++++++++++++++++ .../orc/stream/MissingStreamSource.java | 46 ++ .../thirdparty/orc/stream/OrcInputStream.java | 295 +++++++++++ .../thirdparty/orc/stream/OrcStreamUtils.java | 66 +++ .../RowGroupDictionaryLengthStream.java | 52 ++ .../thirdparty/orc/stream/StreamSource.java | 26 + .../thirdparty/orc/stream/StreamSources.java | 56 ++ .../thirdparty/orc/stream/ValueStream.java | 29 ++ .../orc/stream/ValueStreamSource.java | 53 ++ .../thirdparty/orc/stream/ValueStreams.java | 146 ++++++ 87 files changed, 8857 insertions(+) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/AbstractOrcDataSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcPredicate.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RowGroup.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Vector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InvalidCheckpointException.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/StreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/BooleanStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnEncoding.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/CompressionKind.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DateStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DoubleStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Footer.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/IntegerStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Metadata.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/MetadataReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcType.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/PostScript.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RangeStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RowGroupIndex.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Stream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeFooter.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeInformation.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/OrcReaderUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/BooleanStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java 
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/MissingStreamSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/RowGroupDictionaryLengthStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreamSource.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreams.java diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 3a59ec9f23..96da422964 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -354,6 +354,59 @@ net.minidev json-smart + + io.airlift + slice + 0.10 + + + io.airlift + units + 0.108 + + + com.google.guava + guava + 18.0 + + + joda-time + joda-time + 2.4 + + + org.iq80.snappy + snappy + 0.2 + + + com.facebook.presto.hive + hive-apache + 0.10 + + + com.facebook.hive + hive-dwrf + 0.8 + + + commons-logging + commons-logging + + + org.iq80.snappy + snappy + + + com.facebook.presto.hadoop + hadoop-cdh4 + + + it.unimi.dsi + fastutil + + + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/AbstractOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/AbstractOrcDataSource.java new file mode 100644 index 0000000000..e726870b29 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/AbstractOrcDataSource.java @@ -0,0 +1,233 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.tajo.storage.thirdparty.orc;

import com.google.common.collect.ImmutableMap;
import com.google.common.primitives.Ints;
import io.airlift.slice.*;
import io.airlift.slice.ChunkedSliceInput.BufferReference;
import io.airlift.slice.ChunkedSliceInput.SliceLoader;
import io.airlift.units.DataSize;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;

import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice;
import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Skeletal {@link OrcDataSource} (imported from Presto) that adds read-time
 * accounting and bulk range reading on top of a single primitive,
 * {@link #readInternal(long, byte[], int, int)}, supplied by subclasses.
 *
 * Small ranges (≤ maxBufferSize) are merged when within maxMergeDistance and
 * fetched eagerly into byte arrays; larger ranges are read lazily through
 * {@link ChunkedSliceInput} with a buffer of streamBufferSize.
 */
public abstract class AbstractOrcDataSource
        implements OrcDataSource
{
    private final String name;
    private final long size;
    private final DataSize maxMergeDistance;
    private final DataSize maxBufferSize;
    private final DataSize streamBufferSize;
    // Cumulative wall-clock nanos spent inside readInternal().
    private long readTimeNanos;

    /**
     * @param name             human-readable identifier (used by {@link #toString()})
     * @param size             total size of the underlying data in bytes; must be >= 0
     * @param maxMergeDistance gap up to which adjacent small ranges are coalesced
     * @param maxBufferSize    threshold separating eager ("small") from lazy ("large") reads
     * @param streamBufferSize buffer size for lazy chunked reads of large ranges
     */
    public AbstractOrcDataSource(String name, long size, DataSize maxMergeDistance, DataSize maxBufferSize, DataSize streamBufferSize)
    {
        this.name = checkNotNull(name, "name is null");

        this.size = size;
        checkArgument(size >= 0, "size is negative");

        this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null");
        this.maxBufferSize = checkNotNull(maxBufferSize, "maxBufferSize is null");
        this.streamBufferSize = checkNotNull(streamBufferSize, "streamBufferSize is null");
    }

    /**
     * Reads exactly {@code bufferLength} bytes starting at absolute {@code position}
     * into {@code buffer[bufferOffset..]}. Implemented by concrete sources
     * (e.g. local file, HDFS).
     */
    protected abstract void readInternal(long position, byte[] buffer, int bufferOffset, int bufferLength)
            throws IOException;

    @Override
    public final long getReadTimeNanos()
    {
        return readTimeNanos;
    }

    @Override
    public final long getSize()
    {
        return size;
    }

    @Override
    public final void readFully(long position, byte[] buffer)
            throws IOException
    {
        readFully(position, buffer, 0, buffer.length);
    }

    @Override
    public final void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength)
            throws IOException
    {
        long start = System.nanoTime();

        readInternal(position, buffer, bufferOffset, bufferLength);

        readTimeNanos += System.nanoTime() - start;
    }

    // NOTE(review): the generic key type was stripped by extraction; <K> is
    // reconstructed here — confirm against the OrcDataSource interface.
    @Override
    public final <K> Map<K, FixedLengthSliceInput> readFully(Map<K, DiskRange> diskRanges)
            throws IOException
    {
        checkNotNull(diskRanges, "diskRanges is null");

        if (diskRanges.isEmpty()) {
            return ImmutableMap.of();
        }

        //
        // Note: this code does not use the Java 8 stream APIs to avoid any extra object allocation
        //

        // split disk ranges into "big" and "small"
        long maxReadSizeBytes = maxBufferSize.toBytes();
        ImmutableMap.Builder<K, DiskRange> smallRangesBuilder = ImmutableMap.builder();
        ImmutableMap.Builder<K, DiskRange> largeRangesBuilder = ImmutableMap.builder();
        for (Entry<K, DiskRange> entry : diskRanges.entrySet()) {
            if (entry.getValue().getLength() <= maxReadSizeBytes) {
                smallRangesBuilder.put(entry);
            }
            else {
                largeRangesBuilder.put(entry);
            }
        }
        Map<K, DiskRange> smallRanges = smallRangesBuilder.build();
        Map<K, DiskRange> largeRanges = largeRangesBuilder.build();

        // read ranges
        ImmutableMap.Builder<K, FixedLengthSliceInput> slices = ImmutableMap.builder();
        slices.putAll(readSmallDiskRanges(smallRanges));
        slices.putAll(readLargeDiskRanges(largeRanges));

        return slices.build();
    }

    /**
     * Eagerly reads all small ranges: adjacent/nearby ranges are merged into
     * single requests, then each requested range is sliced out of its merged buffer.
     */
    private <K> Map<K, FixedLengthSliceInput> readSmallDiskRanges(Map<K, DiskRange> diskRanges)
            throws IOException
    {
        if (diskRanges.isEmpty()) {
            return ImmutableMap.of();
        }

        Iterable<DiskRange> mergedRanges = mergeAdjacentDiskRanges(diskRanges.values(), maxMergeDistance, maxBufferSize);

        // read merged ranges, each in one request
        Map<DiskRange, byte[]> buffers = new LinkedHashMap<DiskRange, byte[]>();
        for (DiskRange mergedRange : mergedRanges) {
            byte[] buffer = new byte[mergedRange.getLength()];
            readFully(mergedRange.getOffset(), buffer);
            buffers.put(mergedRange, buffer);
        }

        ImmutableMap.Builder<K, FixedLengthSliceInput> slices = ImmutableMap.builder();
        for (Entry<K, DiskRange> entry : diskRanges.entrySet()) {
            slices.put(entry.getKey(), getDiskRangeSlice(entry.getValue(), buffers).getInput());
        }
        return slices.build();
    }

    /**
     * Wraps each large range in a lazily-loaded {@link ChunkedSliceInput};
     * no bytes are fetched until the caller reads from the input.
     */
    private <K> Map<K, FixedLengthSliceInput> readLargeDiskRanges(Map<K, DiskRange> diskRanges)
            throws IOException
    {
        if (diskRanges.isEmpty()) {
            return ImmutableMap.of();
        }

        ImmutableMap.Builder<K, FixedLengthSliceInput> slices = ImmutableMap.builder();
        for (Entry<K, DiskRange> entry : diskRanges.entrySet()) {
            ChunkedSliceInput sliceInput = new ChunkedSliceInput(new HdfsSliceLoader(entry.getValue()), Ints.checkedCast(streamBufferSize.toBytes()));
            slices.put(entry.getKey(), sliceInput);
        }
        return slices.build();
    }

    @Override
    public final String toString()
    {
        return name;
    }

    /** Adapts one {@link DiskRange} of this data source to the {@link SliceLoader} SPI. */
    private class HdfsSliceLoader
            implements SliceLoader<SliceBufferReference>
    {
        private final DiskRange diskRange;

        public HdfsSliceLoader(DiskRange diskRange)
        {
            this.diskRange = diskRange;
        }

        @Override
        public SliceBufferReference createBuffer(int bufferSize)
        {
            return new SliceBufferReference(bufferSize);
        }

        @Override
        public long getSize()
        {
            return diskRange.getLength();
        }

        @Override
        public void load(long position, SliceBufferReference bufferReference, int length)
        {
            try {
                readFully(diskRange.getOffset() + position, bufferReference.getBuffer(), 0, length);
            }
            catch (IOException e) {
                // BUG FIX: the exception was constructed but never thrown,
                // silently swallowing I/O failures and leaving the buffer
                // contents undefined. Propagate it.
                throw new RuntimeIOException(e);
            }
        }

        @Override
        public void close()
        {
            // Nothing to release: the buffer is plain heap memory.
        }
    }

    /** A reusable byte[] buffer paired with a Slice view over it. */
    private static class SliceBufferReference
            implements BufferReference
    {
        private final byte[] buffer;
        private final Slice slice;

        public SliceBufferReference(int bufferSize)
        {
            this.buffer = new byte[bufferSize];
            this.slice = Slices.wrappedBuffer(buffer);
        }

        public byte[] getBuffer()
        {
            return buffer;
        }

        @Override
        public Slice getSlice()
        {
            return slice;
        }
    }
}
--- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java @@ -0,0 +1,45 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; + +public class BooleanVector + implements Vector +{ + public final boolean[] isNull; + public final boolean[] vector; + + public BooleanVector(int length) + { + if (length > MAX_VECTOR_LENGTH) { + throw new IllegalArgumentException("length greater than max vector length"); + } + isNull = new boolean[length]; + vector = new boolean[length]; + } + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + ObjectVector objectVector = new ObjectVector(vector.length); + for (int i = 0; i < size; i++) { + if (!isNull[i]) { + objectVector.vector[i] = vector[i]; + } + } + return objectVector; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java new file mode 100644 index 0000000000..fdd47556ce --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java @@ -0,0 +1,99 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.primitives.Ints; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +public final class DiskRange +{ + private final long offset; + private final int length; + + public DiskRange(long offset, int length) + { + checkArgument(offset >= 0, "offset is negative"); + checkArgument(length >= 0, "length is negative"); + + this.offset = offset; + this.length = length; + } + + public long getOffset() + { + return offset; + } + + public int getLength() + { + return length; + } + + public long getEnd() + { + return offset + length; + } + + public boolean contains(DiskRange diskRange) + { + return offset <= diskRange.getOffset() && diskRange.getEnd() <= getEnd(); + } + + /** + * Returns the minimal DiskRange that encloses both this DiskRange + * and otherDiskRange. If there was a gap between the ranges the + * new range will cover that gap. 
+ */ + public DiskRange span(DiskRange otherDiskRange) + { + checkNotNull(otherDiskRange, "otherDiskRange is null"); + long start = Math.min(this.offset, otherDiskRange.getOffset()); + long end = Math.max(getEnd(), otherDiskRange.getEnd()); + return new DiskRange(start, Ints.checkedCast(end - start)); + } + + @Override + public int hashCode() + { + return Objects.hash(offset, length); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + DiskRange other = (DiskRange) obj; + return Objects.equals(this.offset, other.offset) + && Objects.equals(this.length, other.length); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("offset", offset) + .add("length", length) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java new file mode 100644 index 0000000000..ba40c493b4 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java @@ -0,0 +1,45 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; + +public class DoubleVector + implements Vector +{ + public final boolean[] isNull; + public final double[] vector; + + public DoubleVector(int length) + { + if (length > MAX_VECTOR_LENGTH) { + throw new IllegalArgumentException("length greater than max vector length"); + } + vector = new double[length]; + isNull = new boolean[length]; + } + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + ObjectVector objectVector = new ObjectVector(vector.length); + for (int i = 0; i < size; i++) { + if (!isNull[i]) { + objectVector.vector[i] = vector[i]; + } + } + return objectVector; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java new file mode 100644 index 0000000000..5325d4b48f --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java @@ -0,0 +1,49 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.tajo.storage.thirdparty.orc;

import io.airlift.units.DataSize;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;

/**
 * {@link OrcDataSource} backed by a local file, read through a
 * {@link RandomAccessFile} opened in read-only mode.
 */
public class FileOrcDataSource
        extends AbstractOrcDataSource
{
    private final RandomAccessFile file;

    /**
     * @param path             local ORC file to open read-only
     * @param maxMergeDistance see {@link AbstractOrcDataSource}
     * @param maxReadSize      see {@link AbstractOrcDataSource}
     * @param streamBufferSize see {@link AbstractOrcDataSource}
     * @throws FileNotFoundException if the file cannot be opened
     */
    public FileOrcDataSource(File path, DataSize maxMergeDistance, DataSize maxReadSize, DataSize streamBufferSize)
            throws FileNotFoundException
    {
        super(path.getPath(), path.length(), maxMergeDistance, maxReadSize, streamBufferSize);
        this.file = new RandomAccessFile(path, "r");
    }

    @Override
    public void close()
            throws IOException
    {
        file.close();
    }

    @Override
    protected void readInternal(long position, byte[] buffer, int bufferOffset, int bufferLength)
            throws IOException
    {
        // Position then read exactly bufferLength bytes (readFully loops
        // internally until the request is satisfied or EOF is hit).
        file.seek(position);
        file.readFully(buffer, bufferOffset, bufferLength);
    }
}
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; + +public class LongVector + implements Vector +{ + public final boolean[] isNull; + public final long[] vector; + + public LongVector(int length) + { + if (length > MAX_VECTOR_LENGTH) { + throw new IllegalArgumentException("length greater than max vector length"); + } + vector = new long[length]; + isNull = new boolean[length]; + } + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + ObjectVector objectVector = new ObjectVector(vector.length); + for (int i = 0; i < size; i++) { + if (!isNull[i]) { + objectVector.vector[i] = vector[i]; + } + } + return objectVector; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java new file mode 100644 index 0000000000..7419ebb8d7 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Vector of boxed values; the terminal representation that the primitive
 * vectors ({@code DoubleVector}, {@code LongVector}, {@code SliceVector})
 * convert into via {@code toObjectVector}.
 */
public class ObjectVector
        implements Vector
{
    // Boxed values; slots left unset by a converting vector remain null
    // (the primitive vectors skip null-masked entries when copying in).
    public final Object[] vector;

    public ObjectVector(int length)
    {
        // MAX_VECTOR_LENGTH is declared on the Vector interface.
        if (length > MAX_VECTOR_LENGTH) {
            throw new IllegalArgumentException("length greater than max vector length");
        }
        vector = new Object[length];
    }

    @Override
    @VisibleForTesting
    public ObjectVector toObjectVector(int size)
    {
        // Already boxed, so no copy is made; note 'size' is intentionally ignored.
        return this;
    }
}
/**
 * IOException raised when an ORC file fails structural validation
 * (bad magic, corrupt checkpoints, inconsistent metadata, ...).
 */
public class OrcCorruptionException
        extends IOException
{
    public OrcCorruptionException(String message)
    {
        super(message);
    }

    /**
     * @param messageFormat a {@link java.util.Formatter}-style format string
     * @param args arguments substituted into {@code messageFormat}
     */
    public OrcCorruptionException(String messageFormat, Object... args)
    {
        super(String.format(messageFormat, args));
    }

    /**
     * @param cause the underlying error, preserved as this exception's cause
     */
    public OrcCorruptionException(Throwable cause, String messageFormat, Object... args)
    {
        super(String.format(messageFormat, args), cause);
    }
}
package org.apache.tajo.storage.thirdparty.orc;

import io.airlift.slice.FixedLengthSliceInput;

import java.io.Closeable;
import java.io.IOException;
import java.util.Map;

/**
 * Random-access byte source for an ORC file.  Implementations (e.g. a local
 * file or an HDFS stream) supply positioned reads; callers must close the
 * source when finished.
 *
 * NOTE(review): generic type parameters appear to have been stripped from
 * these signatures during patch extraction (the map-based readFully takes and
 * returns raw Map).  Judging from StripeReader's usage and the otherwise
 * unused FixedLengthSliceInput import, it presumably maps DiskRange to
 * FixedLengthSliceInput — confirm against the upstream source.
 */
public interface OrcDataSource
        extends Closeable
{
    // Cumulative time spent in reads, for instrumentation.
    long getReadTimeNanos();

    // Total size of the underlying file in bytes.
    long getSize();

    /**
     * Fills {@code buffer} entirely from the absolute file offset {@code position}.
     */
    void readFully(long position, byte[] buffer)
            throws IOException;

    /**
     * Fills {@code bufferLength} bytes of {@code buffer} starting at
     * {@code bufferOffset} from the absolute file offset {@code position}.
     */
    void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength)
            throws IOException;

    /**
     * Reads several disk ranges in one call, allowing implementations to merge
     * adjacent ranges into fewer physical reads.
     */
    Map readFully(Map diskRanges)
            throws IOException;

    @Override
    void close() throws IOException;
}
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.ImmutableList; +import com.google.common.primitives.Ints; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.airlift.units.DataSize; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import static com.google.common.collect.Lists.newArrayList; + +public final class OrcDataSourceUtils +{ + private OrcDataSourceUtils() + { + } + + /** + * Merge disk ranges that are closer than {@code maxMergeDistance}. + */ + public static List mergeAdjacentDiskRanges(Iterable diskRanges, DataSize maxMergeDistance, DataSize maxReadSize) + { + // sort ranges by start offset + List ranges = newArrayList(diskRanges); + Collections.sort(ranges, new Comparator() + { + @Override + public int compare(DiskRange o1, DiskRange o2) + { + return Long.compare(o1.getOffset(), o2.getOffset()); + } + }); + + // merge overlapping ranges + long maxReadSizeBytes = maxReadSize.toBytes(); + long maxMergeDistanceBytes = maxMergeDistance.toBytes(); + ImmutableList.Builder result = ImmutableList.builder(); + DiskRange last = ranges.get(0); + for (int i = 1; i < ranges.size(); i++) { + DiskRange current = ranges.get(i); + DiskRange merged = last.span(current); + if (merged.getLength() <= maxReadSizeBytes && last.getEnd() + maxMergeDistanceBytes >= current.getOffset()) { + last = merged; + } + else { + result.add(last); + last = current; + } + } + result.add(last); + + return result.build(); + } + + /** + * Get a slice for the disk range from the provided buffers. The buffers ranges do not have + * to exactly match {@code diskRange}, but {@code diskRange} must be completely contained within + * one of the buffer ranges. 
+ */ + public static Slice getDiskRangeSlice(DiskRange diskRange, Map buffers) + { + for (Entry bufferEntry : buffers.entrySet()) { + DiskRange bufferRange = bufferEntry.getKey(); + byte[] buffer = bufferEntry.getValue(); + if (bufferRange.contains(diskRange)) { + int offset = Ints.checkedCast(diskRange.getOffset() - bufferRange.getOffset()); + return Slices.wrappedBuffer(buffer, offset, diskRange.getLength()); + } + } + throw new IllegalStateException("No matching buffer for disk range"); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcPredicate.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcPredicate.java new file mode 100644 index 0000000000..b071056f58 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcPredicate.java @@ -0,0 +1,40 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnStatistics; + +import java.util.Map; + +public interface OrcPredicate +{ + OrcPredicate TRUE = new OrcPredicate() + { + @Override + public boolean matches(long numberOfRows, Map statisticsByColumnIndex) + { + return true; + } + }; + + /** + * Should the ORC reader process a file section with the specified statistics. 
/**
 * Row-group/stripe pruning hook: decides from column statistics whether a
 * section of the file can possibly contain matching rows.
 */
public interface OrcPredicate
{
    // Always-true predicate: disables pruning, every section is read.
    OrcPredicate TRUE = new OrcPredicate()
    {
        @Override
        public boolean matches(long numberOfRows, Map statisticsByColumnIndex)
        {
            return true;
        }
    };

    /**
     * Should the ORC reader process a file section with the specified statistics.
     *
     * @param numberOfRows the number of rows in the segment; this can be used with
     * {@code ColumnStatistics} to determine if a column is only null
     * @param statisticsByColumnIndex statistics for column by ordinal position
     * in the file; this will match the field order from the hive metastore
     */
    boolean matches(long numberOfRows, Map statisticsByColumnIndex);
}
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class RowGroup +{ + private final int groupId; + private final long rowCount; + private final StreamSources streamSources; + + public RowGroup(int groupId, long rowCount, StreamSources streamSources) + { + this.groupId = groupId; + this.rowCount = rowCount; + this.streamSources = checkNotNull(streamSources, "streamSources is null"); + } + + public int getGroupId() + { + return groupId; + } + + public long getRowCount() + { + return rowCount; + } + + public StreamSources getStreamSources() + { + return streamSources; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("groupId", groupId) + .add("rowCount", rowCount) + .add("streamSources", streamSources) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java new file mode 100644 index 0000000000..e1a696fdc5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java @@ -0,0 +1,44 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; +import io.airlift.slice.Slice; + +public class SliceVector + implements Vector +{ + public final Slice[] vector; + + public SliceVector(int length) + { + if (length > MAX_VECTOR_LENGTH) { + throw new IllegalArgumentException("length greater than max vector length"); + } + vector = new Slice[length]; + } + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + ObjectVector objectVector = new ObjectVector(vector.length); + for (int i = 0; i < size; i++) { + if (vector[i] != null) { + objectVector.vector[i] = vector[i]; + } + } + return objectVector; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java new file mode 100644 index 0000000000..9a5b53b464 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public final class StreamDescriptor +{ + private final String streamName; + private final int streamId; + private final OrcTypeKind streamType; + private final String fieldName; + private final OrcDataSource fileInput; + private final List nestedStreams; + + public StreamDescriptor(String streamName, int streamId, String fieldName, OrcTypeKind streamType, OrcDataSource fileInput, List nestedStreams) + { + this.streamName = checkNotNull(streamName, "streamName is null"); + this.streamId = streamId; + this.fieldName = checkNotNull(fieldName, "fieldName is null"); + this.streamType = checkNotNull(streamType, "type is null"); + this.fileInput = checkNotNull(fileInput, "fileInput is null"); + this.nestedStreams = ImmutableList.copyOf(checkNotNull(nestedStreams, "nestedStreams is null")); + } + + public String getStreamName() + { + return streamName; + } + + public int getStreamId() + { + return streamId; + } + + public OrcTypeKind getStreamType() + { + return streamType; + } + + public String getFieldName() + { + return fieldName; + } + + public OrcDataSource getFileInput() + { + return fileInput; + } + + public List getNestedStreams() + { + return nestedStreams; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("streamName", streamName) + .add("streamId", streamId) + .add("streamType", streamType) + .add("path", fileInput) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java new file mode 100644 index 
0000000000..08afe28fd1 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; + +import static com.google.common.base.MoreObjects.toStringHelper; + +public final class StreamId +{ + private final int column; + private final StreamKind streamKind; + + public StreamId(Stream stream) + { + this.column = stream.getColumn(); + this.streamKind = stream.getStreamKind(); + } + + public StreamId(int column, StreamKind streamKind) + { + this.column = column; + this.streamKind = streamKind; + } + + public int getColumn() + { + return column; + } + + public StreamKind getStreamKind() + { + return streamKind; + } + + @Override + public int hashCode() + { + return 31 * column + streamKind.hashCode(); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + + StreamId other = (StreamId) obj; + return column == other.column && streamKind == other.streamKind; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("column", column) + .add("streamKind", streamKind) + .toString(); + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java new file mode 100644 index 0000000000..54aa513a35 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java @@ -0,0 +1,70 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class Stripe +{ + private final long rowCount; + private final List columnEncodings; + private final List rowGroups; + private final StreamSources dictionaryStreamSources; + + public Stripe(long rowCount, List columnEncodings, List rowGroups, StreamSources dictionaryStreamSources) + { + this.rowCount = rowCount; + this.columnEncodings = checkNotNull(columnEncodings, "columnEncodings is null"); + this.rowGroups = ImmutableList.copyOf(checkNotNull(rowGroups, "rowGroups is null")); + this.dictionaryStreamSources = checkNotNull(dictionaryStreamSources, "dictionaryStreamSources is null"); + } + + public long getRowCount() + { + return 
rowCount; + } + + public List getColumnEncodings() + { + return columnEncodings; + } + + public List getRowGroups() + { + return rowGroups; + } + + public StreamSources getDictionaryStreamSources() + { + return dictionaryStreamSources; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("rowCount", rowCount) + .add("columnEncodings", columnEncodings) + .add("rowGroups", rowGroups) + .add("dictionaryStreams", dictionaryStreamSources) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java new file mode 100644 index 0000000000..6cc26c666b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java @@ -0,0 +1,402 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.tajo.storage.thirdparty.orc;

import org.apache.tajo.storage.thirdparty.orc.checkpoint.InvalidCheckpointException;
import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint;
import org.apache.tajo.storage.thirdparty.orc.metadata.*;
import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind;
import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind;
import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind;
import org.apache.tajo.storage.thirdparty.orc.stream.*;
import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.primitives.Ints;
import io.airlift.slice.FixedLengthSliceInput;
import io.airlift.slice.Slices;

import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import java.util.Map.Entry;

import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint;
import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getStreamCheckpoints;
import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY;
import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2;
import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*;
import static org.apache.tajo.storage.thirdparty.orc.stream.CheckpointStreamSource.createCheckpointStreamSource;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Reads one ORC stripe: loads the stripe footer, selects the streams of the
 * included columns, reads the backing disk ranges, and materializes the stripe
 * as row groups (optionally pruned via {@link OrcPredicate}) plus the shared
 * dictionary stream sources.
 *
 * NOTE(review): generic type parameters appear to have been stripped from this
 * file during patch extraction (raw List/Set/Map declarations, and fragments
 * such as {@code new ValueStreamSource>(...)} below, which is not valid Java
 * as written).  The types should be restored from the upstream source of this
 * thirdparty port rather than guessed here.
 */
public class StripeReader
{
    private final OrcDataSource orcDataSource;
    private final CompressionKind compressionKind;
    // ORC type tree; element 0 is the root struct (see getRowGroupStatistics).
    private final List types;
    private final int bufferSize;
    // Column ids to read, expanded to include nested children of each included column.
    private final Set includedOrcColumns;
    private final int rowsInRowGroup;
    // Pruning predicate consulted per row group in selectRowGroups.
    private final OrcPredicate predicate;
    private final MetadataReader metadataReader;

    public StripeReader(OrcDataSource orcDataSource,
                        CompressionKind compressionKind,
                        List types,
                        int bufferSize,
                        Set includedColumns,
                        int rowsInRowGroup,
                        OrcPredicate predicate,
                        MetadataReader metadataReader)
    {
        this.orcDataSource = checkNotNull(orcDataSource, "orcDataSource is null");
        this.compressionKind = checkNotNull(compressionKind, "compressionKind is null");
        this.types = ImmutableList.copyOf(checkNotNull(types, "types is null"));
        this.bufferSize = bufferSize;
        this.includedOrcColumns = getIncludedOrcColumns(types, checkNotNull(includedColumns, "includedColumns is null"));
        this.rowsInRowGroup = rowsInRowGroup;
        this.predicate = checkNotNull(predicate, "predicate is null");
        this.metadataReader = checkNotNull(metadataReader, "metadataReader is null");
    }

    /**
     * Reads and plans a stripe.
     *
     * Two paths: stripes with more than 10000 rows (or a row-group dictionary)
     * are read with per-row-group checkpoints and predicate pruning; otherwise
     * the whole stripe is read as a single row group.  Returns {@code null}
     * when the predicate eliminates every row group.
     *
     * @throws IOException on read failure, including OrcCorruptionException
     *         when checkpoints are corrupt and a row-group dictionary is present
     */
    public Stripe readStripe(StripeInformation stripe)
            throws IOException
    {
        // read the stripe footer
        StripeFooter stripeFooter = readStripeFooter(stripe);
        List columnEncodings = stripeFooter.getColumnEncodings();

        // get streams for selected columns
        Map streams = new HashMap();
        boolean hasRowGroupDictionary = false;
        for (Stream stream : stripeFooter.getStreams()) {
            if (includedOrcColumns.contains(stream.getColumn())) {
                streams.put(new StreamId(stream), stream);

                // An IN_DICTIONARY stream under DICTIONARY encoding marks a
                // row-group-scoped dictionary, which forces the checkpointed path.
                ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind();
                if (columnEncoding == DICTIONARY && stream.getStreamKind() == StreamKind.IN_DICTIONARY) {
                    hasRowGroupDictionary = true;
                }
            }
        }

        if (stripe.getNumberOfRows() > 10000 || hasRowGroupDictionary) {
            // determine ranges of the stripe to read
            Map diskRanges = getDiskRanges(stripeFooter.getStreams());
            diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet()));

            // read the file regions
            Map streamsData = readDiskRanges(stripe.getOffset(), diskRanges);

            // read the row index for each column
            Map> columnIndexes = readColumnIndexes(streams, streamsData);

            // select the row groups matching the tuple domain
            Set selectedRowGroups = selectRowGroups(stripe, columnIndexes);

            // if all row groups are skipped, return null
            if (selectedRowGroups.isEmpty()) {
                return null;
            }

            // value streams
            Map> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

            // build the dictionary streams
            StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

            // build the row groups
            try {
                List rowGroups = createRowGroups(
                        stripe.getNumberOfRows(),
                        streams,
                        valueStreams,
                        columnIndexes,
                        selectedRowGroups,
                        columnEncodings);

                return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources);
            }
            catch (InvalidCheckpointException e) {
                // The ORC file contains a corrupt checkpoint stream
                // If the file does not have a row group dictionary, treat the stripe as a single row group. Otherwise,
                // we must fail because the length of the row group dictionary is contained in the checkpoint stream.
                if (hasRowGroupDictionary) {
                    throw new OrcCorruptionException(e, "ORC file %s has corrupt checkpoints", orcDataSource);
                }
                // else: fall through to the single-row-group path below.
            }
        }

        // Single-row-group path: re-plan disk ranges, dropping row-index streams.
        ImmutableMap.Builder diskRangesBuilder = ImmutableMap.builder();
        for (Entry entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) {
            StreamId streamId = entry.getKey();
            if (streamId.getStreamKind() != ROW_INDEX && streams.keySet().contains(streamId)) {
                diskRangesBuilder.put(entry);
            }
        }
        ImmutableMap diskRanges = diskRangesBuilder.build();

        // read the file regions
        Map streamsData = readDiskRanges(stripe.getOffset(), diskRanges);

        // value streams
        Map> valueStreams = createValueStreams(streams, streamsData, columnEncodings);

        // build the dictionary streams
        StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings);

        // build the row group: the whole stripe as group 0, streams positioned at their start.
        ImmutableMap.Builder> builder = ImmutableMap.builder();
        for (Entry> entry : valueStreams.entrySet()) {
            builder.put(entry.getKey(), new ValueStreamSource>(entry.getValue()));
        }
        RowGroup rowGroup = new RowGroup(0, stripe.getNumberOfRows(), new StreamSources(builder.build()));

        return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources);
    }

    /**
     * Reads the given stripe-relative disk ranges from the data source and
     * wraps each as a decompressing {@link OrcInputStream}.
     */
    public Map readDiskRanges(long stripeOffset, Map diskRanges)
            throws IOException
    {
        //
        // Note: this code does not use the Java 8 stream APIs to avoid any extra object allocation
        //

        // transform ranges to have an absolute offset in file
        ImmutableMap.Builder diskRangesBuilder = ImmutableMap.builder();
        for (Entry entry : diskRanges.entrySet()) {
            DiskRange diskRange = entry.getValue();
            diskRangesBuilder.put(entry.getKey(), new DiskRange(stripeOffset + diskRange.getOffset(), diskRange.getLength()));
        }
        diskRanges = diskRangesBuilder.build();

        // read ranges
        Map streamsData = orcDataSource.readFully(diskRanges);

        // transform streams to OrcInputStream
        String sourceName = orcDataSource.toString();
        ImmutableMap.Builder streamsBuilder = ImmutableMap.builder();
        for (Entry entry : streamsData.entrySet()) {
            streamsBuilder.put(entry.getKey(), new OrcInputStream(sourceName, entry.getValue(), compressionKind, bufferSize));
        }
        return streamsBuilder.build();
    }

    /**
     * Builds a typed ValueStream for each non-index, non-empty stream,
     * based on the column's ORC type and encoding.
     */
    private Map> createValueStreams(Map streams, Map streamsData, List columnEncodings)
    {
        ImmutableMap.Builder> valueStreams = ImmutableMap.builder();
        for (Entry entry : streams.entrySet()) {
            StreamId streamId = entry.getKey();
            Stream stream = entry.getValue();
            ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind();

            // skip index and empty streams
            if (isIndexStream(stream) || stream.getLength() == 0) {
                continue;
            }

            OrcInputStream inputStream = streamsData.get(streamId);
            OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind();

            valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts()));
        }
        return valueStreams.build();
    }

    /**
     * Collects the dictionary streams (stripe-wide data shared by all row
     * groups) and positions each at its dictionary checkpoint.
     */
    public StreamSources createDictionaryStreamSources(Map streams, Map> valueStreams, List columnEncodings)
    {
        ImmutableMap.Builder> dictionaryStreamBuilder = ImmutableMap.builder();
        for (Entry entry : streams.entrySet()) {
            StreamId streamId = entry.getKey();
            Stream stream = entry.getValue();
            int column = stream.getColumn();

            // only process dictionary streams
            ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind();
            if (!isDictionary(stream, columnEncoding)) {
                continue;
            }

            // skip streams without data
            ValueStream valueStream = valueStreams.get(streamId);
            if (valueStream == null) {
                continue;
            }

            OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind();
            StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding);

            StreamSource streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint);
            dictionaryStreamBuilder.put(streamId, streamSource);
        }
        return new StreamSources(dictionaryStreamBuilder.build());
    }

    /**
     * Builds a RowGroup for each selected group, with every stream positioned
     * at that group's checkpoint taken from the row index.
     *
     * @throws InvalidCheckpointException if the checkpoint data is malformed
     */
    private List createRowGroups(
            int rowsInStripe,
            Map streams,
            Map> valueStreams,
            Map> columnIndexes,
            Set selectedRowGroups,
            List encodings)
            throws InvalidCheckpointException
    {
        ImmutableList.Builder rowGroupBuilder = ImmutableList.builder();

        for (int rowGroupId : selectedRowGroups) {
            Map checkpoints = getStreamCheckpoints(includedOrcColumns, types, compressionKind, rowGroupId, encodings, streams, columnIndexes);
            // The final group may be shorter than rowsInRowGroup.
            int rowsInGroup = Math.min(rowsInStripe - (rowGroupId * rowsInRowGroup), rowsInRowGroup);
            rowGroupBuilder.add(createRowGroup(rowGroupId, rowsInGroup, valueStreams, checkpoints));
        }

        return rowGroupBuilder.build();
    }

    /**
     * Assembles one RowGroup from checkpointed stream sources; streams with no
     * data (absent from {@code valueStreams}) are silently omitted.
     */
    public static RowGroup createRowGroup(int groupId, int rowCount, Map> valueStreams, Map checkpoints)
    {
        ImmutableMap.Builder> builder = ImmutableMap.builder();
        for (Entry entry : checkpoints.entrySet()) {
            StreamId streamId = entry.getKey();
            StreamCheckpoint checkpoint = entry.getValue();

            // skip streams without data
            ValueStream valueStream = valueStreams.get(streamId);
            if (valueStream == null) {
                continue;
            }

            builder.put(streamId, createCheckpointStreamSource(valueStream, checkpoint));
        }
        StreamSources rowGroupStreams = new StreamSources(builder.build());
        return new RowGroup(groupId, rowCount, rowGroupStreams);
    }

    /**
     * Reads and decodes the stripe footer, which sits after the index and data
     * sections of the stripe.
     */
    public StripeFooter readStripeFooter(StripeInformation stripe)
            throws IOException
    {
        long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength();
        int tailLength = Ints.checkedCast(stripe.getFooterLength());

        // read the footer
        byte[] tailBuffer = new byte[tailLength];
        orcDataSource.readFully(offset, tailBuffer);
        InputStream inputStream = new OrcInputStream(orcDataSource.toString(), Slices.wrappedBuffer(tailBuffer).getInput(), compressionKind, bufferSize);
        return metadataReader.readStripeFooter(types, inputStream);
    }

    /**
     * Decodes the ROW_INDEX stream of each column into its row-group indexes,
     * keyed by column ordinal.
     */
    private Map> readColumnIndexes(Map streams, Map streamsData)
            throws IOException
    {
        ImmutableMap.Builder> columnIndexes = ImmutableMap.builder();
        for (Entry entry : streams.entrySet()) {
            Stream stream = entry.getValue();
            if (stream.getStreamKind() == ROW_INDEX) {
                OrcInputStream inputStream = streamsData.get(entry.getKey());
                columnIndexes.put(stream.getColumn(), metadataReader.readRowIndexes(inputStream));
            }
        }
        return columnIndexes.build();
    }

    /**
     * Runs the predicate against each row group's statistics and returns the
     * ids of the groups that may contain matching rows.
     */
    private Set selectRowGroups(StripeInformation stripe, Map> columnIndexes)
            throws IOException
    {
        int rowsInStripe = Ints.checkedCast(stripe.getNumberOfRows());
        int groupsInStripe = ceil(rowsInStripe, rowsInRowGroup);

        ImmutableSet.Builder selectedRowGroups = ImmutableSet.builder();
        int remainingRows = rowsInStripe;
        for (int rowGroup = 0; rowGroup < groupsInStripe; ++rowGroup) {
            int rows = Math.min(remainingRows, rowsInRowGroup);
            Map statistics = getRowGroupStatistics(types.get(0), columnIndexes, rowGroup);
            if (predicate.matches(rows, statistics)) {
                selectedRowGroups.add(rowGroup);
            }
            remainingRows -= rows;
        }
        return selectedRowGroups.build();
    }

    /**
     * Extracts the statistics of one row group, re-keyed by the root struct's
     * field ordinal (the order OrcPredicate expects).
     */
    private static Map getRowGroupStatistics(OrcType rootStructType, Map> columnIndexes, int rowGroup)
    {
        checkNotNull(rootStructType, "rootStructType is null");
        checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
        checkNotNull(columnIndexes, "columnIndexes is null");
        checkArgument(rowGroup >= 0, "rowGroup is negative");

        ImmutableMap.Builder statistics = ImmutableMap.builder();
        for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
            List rowGroupIndexes = columnIndexes.get(rootStructType.getFieldTypeIndex(ordinal));
            if (rowGroupIndexes != null) {
                statistics.put(ordinal, rowGroupIndexes.get(rowGroup).getColumnStatistics());
            }
        }
        return statistics.build();
    }

    // Index-only streams carry no row data and are excluded from value streams.
    private static boolean isIndexStream(Stream stream)
    {
        return stream.getStreamKind() == ROW_INDEX || stream.getStreamKind() == DICTIONARY_COUNT;
    }

    // A stream belongs to the dictionary when it is DICTIONARY_DATA, or a
    // LENGTH stream under a dictionary encoding.
    private static boolean isDictionary(Stream stream, ColumnEncodingKind columnEncoding)
    {
        return stream.getStreamKind() == DICTIONARY_DATA || (stream.getStreamKind() == LENGTH && (columnEncoding == DICTIONARY || columnEncoding == DICTIONARY_V2));
    }

    /**
     * Maps each stream to its stripe-relative disk range; streams are laid out
     * back-to-back in the order the footer lists them.
     */
    private static Map getDiskRanges(List streams)
    {
        ImmutableMap.Builder streamDiskRanges = ImmutableMap.builder();
        long stripeOffset = 0;
        for (Stream stream : streams) {
            int streamLength = Ints.checkedCast(stream.getLength());
            streamDiskRanges.put(new StreamId(stream), new DiskRange(stripeOffset, streamLength));
            stripeOffset += streamLength;
        }
        return streamDiskRanges.build();
    }

    /**
     * Expands the caller's included column ordinals (fields of the root
     * struct) into the full set of ORC column ids, including nested children.
     */
    private static Set getIncludedOrcColumns(List types, Set includedColumns)
    {
        Set includes = new LinkedHashSet();

        OrcType root = types.get(0);
        for (int includedColumn : includedColumns) {
            includeOrcColumnsRecursive(types, includes, root.getFieldTypeIndex(includedColumn));
        }

        return includes;
    }

    // Depth-first walk adding typeId and all of its descendant type ids.
    private static void includeOrcColumnsRecursive(List types, Set result, int typeId)
    {
        result.add(typeId);
        OrcType type = types.get(typeId);
        int children = type.getFieldCount();
        for (int i = 0; i < children; ++i) {
            includeOrcColumnsRecursive(types, result, type.getFieldTypeIndex(i));
        }
    }

    /**
     * Ceiling of integer division
     */
    private static int ceil(int dividend, int divisor)
    {
        return ((dividend + divisor) - 1) / divisor;
    }
}
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; + +public interface Vector +{ + int MAX_VECTOR_LENGTH = 1024; + + @VisibleForTesting + ObjectVector toObjectVector(int size); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java new file mode 100644 index 0000000000..f5396dfee1 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java @@ -0,0 +1,58 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static com.google.common.base.Preconditions.checkNotNull; + +public final class BooleanStreamCheckpoint + implements StreamCheckpoint +{ + private final int offset; + private final ByteStreamCheckpoint byteStreamCheckpoint; + + public BooleanStreamCheckpoint(int offset, ByteStreamCheckpoint byteStreamCheckpoint) + { + this.offset = offset; + this.byteStreamCheckpoint = checkNotNull(byteStreamCheckpoint, "byteStreamCheckpoint is null"); + } + + public BooleanStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + byteStreamCheckpoint = new ByteStreamCheckpoint(compressionKind, positionsList); + offset = positionsList.nextPosition(); + } + + public int getOffset() + { + return offset; + } + + public ByteStreamCheckpoint getByteStreamCheckpoint() + { + return byteStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("offset", offset) + .add("byteStreamCheckpoint", byteStreamCheckpoint) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java new file mode 100644 index 0000000000..0be5955fda --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class ByteArrayStreamCheckpoint + implements StreamCheckpoint +{ + private final long inputStreamCheckpoint; + + public ByteArrayStreamCheckpoint(long inputStreamCheckpoint) + { + this.inputStreamCheckpoint = inputStreamCheckpoint; + } + + public ByteArrayStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java new file mode 100644 index 0000000000..9a12b14d9d --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; +import static com.google.common.base.Preconditions.checkNotNull; + +public final class ByteStreamCheckpoint + implements StreamCheckpoint +{ + private final int offset; + private final long inputStreamCheckpoint; + + public ByteStreamCheckpoint(int offset, long inputStreamCheckpoint) + { + this.offset = offset; + this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); + } + + public ByteStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + offset = positionsList.nextPosition(); + } + + public int getOffset() + { + return offset; + } + + public long getInputStreamCheckpoint() + { + return 
inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("offset", offset) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java new file mode 100644 index 0000000000..847d950e35 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java @@ -0,0 +1,408 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.StreamId; +import org.apache.tajo.storage.thirdparty.orc.metadata.*; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; +import com.google.common.collect.*; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Predicates.equalTo; + +public final class Checkpoints +{ + private Checkpoints() + { + } + + public static Map getStreamCheckpoints( + Set columns, + List columnTypes, + CompressionKind compressionKind, + int rowGroupId, + List columnEncodings, + Map streams, + Map> columnIndexes) + throws InvalidCheckpointException + { + ImmutableSetMultimap.Builder streamKindsBuilder = ImmutableSetMultimap.builder(); + for (Stream stream : streams.values()) { + streamKindsBuilder.put(stream.getColumn(), stream.getStreamKind()); + } + SetMultimap streamKinds = streamKindsBuilder.build(); + + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + for (int column : columns) { + List positionsList = columnIndexes.get(column).get(rowGroupId).getPositions(); + + ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind(); + OrcTypeKind columnType = columnTypes.get(column).getOrcTypeKind(); + Set availableStreams = streamKinds.get(column); + + ColumnPositionsList columnPositionsList = new ColumnPositionsList(column, columnType, 
positionsList); + switch (columnType) { + case BOOLEAN: + checkpoints.putAll(getBooleanColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case BYTE: + checkpoints.putAll(getByteColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case SHORT: + case INT: + case LONG: + case DATE: + checkpoints.putAll(getLongColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); + break; + case FLOAT: + checkpoints.putAll(getFloatColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case DOUBLE: + checkpoints.putAll(getDoubleColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case TIMESTAMP: + checkpoints.putAll(getTimestampColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); + break; + case BINARY: + case STRING: + checkpoints.putAll(getSliceColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); + break; + case LIST: + case MAP: + checkpoints.putAll(getListOrMapColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); + break; + case STRUCT: + checkpoints.putAll(getStructColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case DECIMAL: + case CHAR: + case VARCHAR: + case UNION: + throw new IllegalArgumentException("Unsupported column type " + columnType); + } + + // The DWRF code is not meticulous in the handling of checkpoints. It appears that for the first row group + // it will write checkpoints for all streams, but in other cases it will write only the streams that exist. 
+ // We detect this case by checking that all offsets in the initial position list are zero, and if so, we + // clear the extra offsets + if (columnPositionsList.hasNextPosition() && !Iterables.all(positionsList, equalTo(0))) { + throw new InvalidCheckpointException(String.format("Column %s, of type %s, contains %s offset positions, but only %s positions were consumed", + column, + columnType, + positionsList.size(), + columnPositionsList.getIndex())); + } + } + return checkpoints.build(); + } + + public static StreamCheckpoint getDictionaryStreamCheckpoint(StreamId streamId, OrcTypeKind columnType, ColumnEncodingKind columnEncoding) + { + if (streamId.getStreamKind() == DICTIONARY_DATA) { + switch (columnType) { + case SHORT: + case INT: + case LONG: + return new LongStreamDwrfCheckpoint(createInputStreamCheckpoint(0, 0)); + case STRING: + case VARCHAR: + case CHAR: + case BINARY: + return new ByteArrayStreamCheckpoint(createInputStreamCheckpoint(0, 0)); + } + } + + // dictionary length and data streams are unsigned long streams + if (streamId.getStreamKind() == LENGTH || streamId.getStreamKind() == DATA) { + if (columnEncoding == DICTIONARY_V2) { + return new LongStreamV2Checkpoint(0, createInputStreamCheckpoint(0, 0)); + } + else if (columnEncoding == DICTIONARY) { + return new LongStreamV1Checkpoint(0, createInputStreamCheckpoint(0, 0)); + } + } + throw new IllegalArgumentException("Unsupported column type " + columnType + " for dictionary stream " + streamId); + } + + private static Map getBooleanColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new 
BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getByteColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new ByteStreamCheckpoint(compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getLongColumnCheckpoints( + int column, + ColumnEncodingKind encoding, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(IN_DICTIONARY)) { + checkpoints.put(new StreamId(column, IN_DICTIONARY), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getFloatColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new FloatStreamCheckpoint(compressionKind, positionsList)); + } 
+ + return checkpoints.build(); + } + + private static Map getDoubleColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new DoubleStreamCheckpoint(compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getTimestampColumnCheckpoints( + int column, + ColumnEncodingKind encoding, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + if (availableStreams.contains(SECONDARY)) { + checkpoints.put(new StreamId(column, SECONDARY), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getSliceColumnCheckpoints( + int column, + ColumnEncodingKind encoding, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (encoding == DIRECT || encoding == DIRECT_V2) { + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new 
ByteArrayStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(LENGTH)) { + checkpoints.put(new StreamId(column, LENGTH), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + } + else if (encoding == DICTIONARY || encoding == DICTIONARY_V2) { + // DWRF has rules inconsistent with the ORC style + if (availableStreams.contains(IN_DICTIONARY)) { + if (availableStreams.contains(ROW_GROUP_DICTIONARY)) { + checkpoints.put(new StreamId(column, ROW_GROUP_DICTIONARY), new ByteArrayStreamCheckpoint(compressionKind, positionsList)); + } + + checkpoints.put(new StreamId(column, ROW_GROUP_DICTIONARY_LENGTH), new RowGroupDictionaryLengthStreamCheckpoint(compressionKind, positionsList)); + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + checkpoints.put(new StreamId(column, IN_DICTIONARY), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + else { + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + } + } + else { + throw new IllegalArgumentException("Unsupported encoding for slice column: " + encoding); + } + + return checkpoints.build(); + } + + private static Map getListOrMapColumnCheckpoints( + int column, + ColumnEncodingKind encoding, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(LENGTH)) { + checkpoints.put(new StreamId(column, LENGTH), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map 
getStructColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static StreamCheckpoint createLongStreamCheckpoint(ColumnEncodingKind encoding, CompressionKind compressionKind, ColumnPositionsList positionsList) + { + if (encoding == DIRECT_V2 || encoding == DICTIONARY_V2) { + return new LongStreamV2Checkpoint(compressionKind, positionsList); + } + + if (encoding == DIRECT || encoding == DICTIONARY) { + return new LongStreamV1Checkpoint(compressionKind, positionsList); + } + + if (encoding == DWRF_DIRECT) { + return new LongStreamDwrfCheckpoint(compressionKind, positionsList); + } + + throw new IllegalArgumentException("Unsupported encoding for long stream: " + encoding); + } + + public static class ColumnPositionsList + { + private final int column; + private final OrcTypeKind columnType; + private final List positionsList; + private int index; + + private ColumnPositionsList(int column, OrcTypeKind columnType, List positionsList) + { + this.column = column; + this.columnType = checkNotNull(columnType, "columnType is null"); + this.positionsList = ImmutableList.copyOf(checkNotNull(positionsList, "positionsList is null")); + } + + public int getIndex() + { + return index; + } + + public boolean hasNextPosition() + { + return index < positionsList.size(); + } + + public int nextPosition() + { + if (!hasNextPosition()) { + throw new InvalidCheckpointException("Not enough positions for column %s, of type %s, checkpoints", + column, + columnType); + } + + return positionsList.get(index++); + } + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java new file mode 100644 index 0000000000..62bf0413a5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class DoubleStreamCheckpoint + implements StreamCheckpoint +{ + private final long inputStreamCheckpoint; + + public DoubleStreamCheckpoint(long inputStreamCheckpoint) + { + this.inputStreamCheckpoint = inputStreamCheckpoint; + } + + public DoubleStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java new file mode 100644 index 0000000000..4edb28787a --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class FloatStreamCheckpoint + implements StreamCheckpoint +{ + private final long inputStreamCheckpoint; + + public FloatStreamCheckpoint(long inputStreamCheckpoint) + { + this.inputStreamCheckpoint = inputStreamCheckpoint; + } + + public FloatStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java new file mode 100644 index 0000000000..b9bf773163 --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java @@ -0,0 +1,64 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.UNCOMPRESSED; + +/** + * InputStreamCheckpoint is represented as a packed long to avoid object creation in inner loops. 
+ */ +public final class InputStreamCheckpoint +{ + private InputStreamCheckpoint() + { + } + + public static long createInputStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + if (compressionKind == UNCOMPRESSED) { + return createInputStreamCheckpoint(0, positionsList.nextPosition()); + } + else { + return createInputStreamCheckpoint(positionsList.nextPosition(), positionsList.nextPosition()); + } + } + + public static long createInputStreamCheckpoint(int compressedBlockOffset, int decompressedOffset) + { + return (((long) compressedBlockOffset) << 32) | decompressedOffset; + } + + public static int decodeCompressedBlockOffset(long inputStreamCheckpoint) + { + return ((int) (inputStreamCheckpoint >> 32)); + } + + public static int decodeDecompressedOffset(long inputStreamCheckpoint) + { + // low order bits contain the decompressed offset, so a simple cast here will suffice + return (int) inputStreamCheckpoint; + } + + public static String inputStreamCheckpointToString(long inputStreamCheckpoint) + { + return MoreObjects.toStringHelper(InputStreamCheckpoint.class) + .add("decompressedOffset", decodeDecompressedOffset(inputStreamCheckpoint)) + .add("compressedBlockOffset", decodeCompressedBlockOffset(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InvalidCheckpointException.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InvalidCheckpointException.java new file mode 100644 index 0000000000..e8438369a0 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InvalidCheckpointException.java @@ -0,0 +1,25 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Thrown when stream checkpoint positions read from an ORC/DWRF row index
 * are malformed or inconsistent.
 */
public class InvalidCheckpointException
        extends RuntimeException
{
    /**
     * @param message   a {@link String#format} template describing the problem
     * @param arguments values substituted into {@code message}
     */
    public InvalidCheckpointException(String message, Object... arguments)
    {
        // Render eagerly so the final message travels with the exception.
        super(String.format(message, arguments));
    }
}
package org.apache.tajo.storage.thirdparty.orc.checkpoint;

/**
 * Marker interface for checkpoints of integer-valued ORC streams, allowing the
 * V1, V2, and DWRF checkpoint representations to be handled polymorphically.
 */
public interface LongStreamCheckpoint
        extends StreamCheckpoint
{
}
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class LongStreamDwrfCheckpoint + implements LongStreamCheckpoint +{ + private final long inputStreamCheckpoint; + + public LongStreamDwrfCheckpoint(long inputStreamCheckpoint) + { + this.inputStreamCheckpoint = inputStreamCheckpoint; + } + + public LongStreamDwrfCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java new file mode 100644 index 0000000000..2b7a56b850 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; +import static com.google.common.base.Preconditions.checkNotNull; + +public class LongStreamV1Checkpoint + implements LongStreamCheckpoint +{ + private final int offset; + private final long inputStreamCheckpoint; + + public LongStreamV1Checkpoint(int offset, long inputStreamCheckpoint) + { + this.offset = offset; + this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); + } + + public LongStreamV1Checkpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + offset = positionsList.nextPosition(); + } + + public int getOffset() + { + return offset; + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("offset", offset) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java new file mode 100644 index 0000000000..680a8982e6 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; +import static com.google.common.base.Preconditions.checkNotNull; + +public final class LongStreamV2Checkpoint + implements LongStreamCheckpoint +{ + private final int offset; + private final long inputStreamCheckpoint; + + public LongStreamV2Checkpoint(int offset, long inputStreamCheckpoint) + { + this.offset = offset; + this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); + } + + public LongStreamV2Checkpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + offset = positionsList.nextPosition(); + } + + public int getOffset() + { + return offset; + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("offset", offset) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java new file mode 100644 index 0000000000..97d28e0e19 --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class RowGroupDictionaryLengthStreamCheckpoint + extends LongStreamV1Checkpoint +{ + private final int rowGroupDictionarySize; + + public RowGroupDictionaryLengthStreamCheckpoint(int rowGroupDictionarySize, int offset, long inputStreamCheckpoint) + { + super(offset, inputStreamCheckpoint); + this.rowGroupDictionarySize = rowGroupDictionarySize; + } + + public RowGroupDictionaryLengthStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + super(compressionKind, positionsList); + rowGroupDictionarySize = positionsList.nextPosition(); + } + + public int getRowGroupDictionarySize() + { + return rowGroupDictionarySize; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("rowGroupDictionarySize", rowGroupDictionarySize) + .add("offset", getOffset()) + 
.add("inputStreamCheckpoint", inputStreamCheckpointToString(getInputStreamCheckpoint())) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/StreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/StreamCheckpoint.java new file mode 100644 index 0000000000..025c2ae5ad --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/StreamCheckpoint.java @@ -0,0 +1,18 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +public interface StreamCheckpoint +{ +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/BooleanStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/BooleanStatistics.java new file mode 100644 index 0000000000..971cfe3779 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/BooleanStatistics.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Column statistics for a BOOLEAN column: the number of {@code true}
 * values recorded for the column.
 */
public class BooleanStatistics
{
    private final long trueValueCount;

    public BooleanStatistics(long trueValueCount)
    {
        this.trueValueCount = trueValueCount;
    }

    /** @return how many {@code true} values were counted */
    public long getTrueValueCount()
    {
        return trueValueCount;
    }
}
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class ColumnEncoding +{ + public enum ColumnEncodingKind + { + DIRECT, + DICTIONARY, + DIRECT_V2, + DICTIONARY_V2, + DWRF_DIRECT, + } + + private final ColumnEncodingKind columnEncodingKind; + private final int dictionarySize; + + public ColumnEncoding(ColumnEncodingKind columnEncodingKind, int dictionarySize) + { + this.columnEncodingKind = checkNotNull(columnEncodingKind, "columnEncodingKind is null"); + this.dictionarySize = dictionarySize; + } + + public ColumnEncodingKind getColumnEncodingKind() + { + return columnEncodingKind; + } + + public int getDictionarySize() + { + return dictionarySize; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("columnEncodingKind", columnEncodingKind) + .add("dictionarySize", dictionarySize) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnStatistics.java new file mode 100644 index 0000000000..7f1e0d988a --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnStatistics.java @@ -0,0 +1,74 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +public class ColumnStatistics +{ + private final Long numberOfValues; + private final BooleanStatistics booleanStatistics; + private final IntegerStatistics integerStatistics; + private final DoubleStatistics doubleStatistics; + private final StringStatistics stringStatistics; + private final DateStatistics dateStatistics; + + public ColumnStatistics(Long numberOfValues, + BooleanStatistics booleanStatistics, + IntegerStatistics integerStatistics, + DoubleStatistics doubleStatistics, + StringStatistics stringStatistics, + DateStatistics dateStatistics) + { + this.numberOfValues = numberOfValues; + this.booleanStatistics = booleanStatistics; + this.integerStatistics = integerStatistics; + this.doubleStatistics = doubleStatistics; + this.stringStatistics = stringStatistics; + this.dateStatistics = dateStatistics; + } + + public boolean hasNumberOfValues() + { + return numberOfValues != null; + } + + public long getNumberOfValues() + { + return numberOfValues == null ? 
0 : numberOfValues; + } + + public BooleanStatistics getBooleanStatistics() + { + return booleanStatistics; + } + + public DateStatistics getDateStatistics() + { + return dateStatistics; + } + + public DoubleStatistics getDoubleStatistics() + { + return doubleStatistics; + } + + public IntegerStatistics getIntegerStatistics() + { + return integerStatistics; + } + + public StringStatistics getStringStatistics() + { + return stringStatistics; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/CompressionKind.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/CompressionKind.java new file mode 100644 index 0000000000..1b34f17e9e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/CompressionKind.java @@ -0,0 +1,19 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * General-purpose compression codec applied to ORC stream data.
 * Constant order is part of the on-disk encoding and must not change.
 */
public enum CompressionKind
{
    /** No compression. */
    UNCOMPRESSED,
    /** zlib compression. */
    ZLIB,
    /** Snappy compression. */
    SNAPPY
}
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +public class DateStatistics + implements RangeStatistics +{ + private final Integer minimum; + private final Integer maximum; + + public DateStatistics(Integer minimum, Integer maximum) + { + this.minimum = minimum; + this.maximum = maximum; + } + + @Override + public Integer getMin() + { + return minimum; + } + + @Override + public Integer getMax() + { + return maximum; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DoubleStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DoubleStatistics.java new file mode 100644 index 0000000000..5bb13e7307 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DoubleStatistics.java @@ -0,0 +1,39 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +public class DoubleStatistics + implements RangeStatistics +{ + private final Double minimum; + private final Double maximum; + + public DoubleStatistics(Double minimum, Double maximum) + { + this.minimum = minimum; + this.maximum = maximum; + } + + @Override + public Double getMin() + { + return minimum; + } + + @Override + public Double getMax() + { + return maximum; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java new file mode 100644 index 0000000000..73edd32004 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java @@ -0,0 +1,373 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.facebook.hive.orc.OrcProto; +import com.facebook.hive.orc.OrcProto.ColumnEncoding.Kind; +import com.google.common.base.Function; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.common.primitives.Ints; +import com.google.protobuf.CodedInputStream; +import io.airlift.slice.Slice; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; +import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader.getMaxSlice; +import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader.getMinSlice; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; + +public class DwrfMetadataReader + implements MetadataReader +{ + @Override + public PostScript readPostScript(byte[] data, int offset, int length) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(data, offset, length); + OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(input); + + return new PostScript( + ImmutableList.of(), + postScript.getFooterLength(), + 0, + toCompression(postScript.getCompression()), + postScript.getCompressionBlockSize()); + } + + @Override + public Metadata readMetadata(InputStream inputStream) + throws IOException + { + return new Metadata(ImmutableList.of()); + } + + @Override + public Footer readFooter(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + 
OrcProto.Footer footer = OrcProto.Footer.parseFrom(input); + return new Footer( + footer.getNumberOfRows(), + footer.getRowIndexStride(), + toStripeInformation(footer.getStripesList()), + toType(footer.getTypesList()), + toColumnStatistics(footer.getStatisticsList(), false)); + } + + private static List toStripeInformation(List types) + { + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(types, DwrfMetadataReader::toStripeInformation)); + return ImmutableList.copyOf(Iterables.transform(types, new Function() { + @Nullable + @Override + public StripeInformation apply(OrcProto.StripeInformation stripeInformation) { + return toStripeInformation(stripeInformation); + } + })); + } + + private static StripeInformation toStripeInformation(OrcProto.StripeInformation stripeInformation) + { + return new StripeInformation( + Ints.checkedCast(stripeInformation.getNumberOfRows()), + stripeInformation.getOffset(), + stripeInformation.getIndexLength(), + stripeInformation.getDataLength(), + stripeInformation.getFooterLength()); + } + + @Override + public StripeFooter readStripeFooter(List types, InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.StripeFooter stripeFooter = OrcProto.StripeFooter.parseFrom(input); + return new StripeFooter(toStream(stripeFooter.getStreamsList()), toColumnEncoding(types, stripeFooter.getColumnsList())); + } + + private static Stream toStream(OrcProto.Stream stream) + { + return new Stream(stream.getColumn(), toStreamKind(stream.getKind()), Ints.checkedCast(stream.getLength()), stream.getUseVInts()); + } + + private static List toStream(List streams) + { + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(streams, DwrfMetadataReader::toStream)); + return ImmutableList.copyOf(Iterables.transform(streams, new Function() { + @Nullable + @Override + public Stream apply(@Nullable OrcProto.Stream stream) { + return 
toStream(stream); + } + })); + } + + private static ColumnEncoding toColumnEncoding(OrcTypeKind type, OrcProto.ColumnEncoding columnEncoding) + { + return new ColumnEncoding(toColumnEncodingKind(type, columnEncoding.getKind()), columnEncoding.getDictionarySize()); + } + + private static List toColumnEncoding(List types, List columnEncodings) + { + checkArgument(types.size() == columnEncodings.size()); + + ImmutableList.Builder encodings = ImmutableList.builder(); + for (int i = 0; i < types.size(); i++) { + OrcType type = types.get(i); + encodings.add(toColumnEncoding(type.getOrcTypeKind(), columnEncodings.get(i))); + } + return encodings.build(); + } + + @Override + public List readRowIndexes(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); + + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), DwrfMetadataReader::toRowGroupIndex)); + return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() { + @Nullable + @Override + public RowGroupIndex apply(@Nullable OrcProto.RowIndexEntry rowIndexEntry) { + return toRowGroupIndex(rowIndexEntry); + } + })); + } + + private static RowGroupIndex toRowGroupIndex(OrcProto.RowIndexEntry rowIndexEntry) + { + List positionsList = rowIndexEntry.getPositionsList(); + ImmutableList.Builder positions = ImmutableList.builder(); + for (int index = 0; index < positionsList.size(); index++) { + long longPosition = positionsList.get(index); + int intPosition = (int) longPosition; + + checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index); + + positions.add(intPosition); + } + return new RowGroupIndex(positions.build(), toColumnStatistics(rowIndexEntry.getStatistics(), true)); + } + + private static List toColumnStatistics(List columnStatistics, final boolean isRowGroup) + { + if 
(columnStatistics == null) { + return ImmutableList.of(); + } + + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(columnStatistics, statistics -> toColumnStatistics(statistics, isRowGroup))); + return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() { + @Nullable + @Override + public ColumnStatistics apply(@Nullable OrcProto.ColumnStatistics columnStatistics) { + return toColumnStatistics(columnStatistics, isRowGroup); + } + })); + } + + private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) + { + return new ColumnStatistics( + statistics.getNumberOfValues(), + toBooleanStatistics(statistics.getBucketStatistics()), + toIntegerStatistics(statistics.getIntStatistics()), + toDoubleStatistics(statistics.getDoubleStatistics()), + toStringStatistics(statistics.getStringStatistics(), isRowGroup), + null); + } + + private static BooleanStatistics toBooleanStatistics(OrcProto.BucketStatistics bucketStatistics) + { + if (bucketStatistics.getCountCount() == 0) { + return null; + } + + return new BooleanStatistics(bucketStatistics.getCount(0)); + } + + private static IntegerStatistics toIntegerStatistics(OrcProto.IntegerStatistics integerStatistics) + { + if (!integerStatistics.hasMinimum() && !integerStatistics.hasMaximum()) { + return null; + } + + return new IntegerStatistics( + integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null, + integerStatistics.hasMaximum() ? 
integerStatistics.getMaximum() : null); + } + + private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics doubleStatistics) + { + if (!doubleStatistics.hasMinimum() && !doubleStatistics.hasMaximum()) { + return null; + } + + // if either min, max, or sum is NaN, ignore the stat + if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || + (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum())) || + (doubleStatistics.hasSum() && Double.isNaN(doubleStatistics.getSum()))) { + return null; + } + + return new DoubleStatistics( + doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null, + doubleStatistics.hasMaximum() ? doubleStatistics.getMaximum() : null); + } + + private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) + { + // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 + if (!isRowGroup) { + return null; + } + + if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { + return null; + } + + Slice minimum = stringStatistics.hasMinimum() ? getMinSlice(stringStatistics.getMinimum()) : null; + Slice maximum = stringStatistics.hasMaximum() ? 
getMaxSlice(stringStatistics.getMaximum()) : null; + + return new StringStatistics(minimum, maximum); + } + + private static OrcType toType(OrcProto.Type type) + { + return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList()); + } + + private static List toType(List types) + { + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(types, DwrfMetadataReader::toType)); + return ImmutableList.copyOf(Iterables.transform(types, new Function() { + @Nullable + @Override + public OrcType apply(@Nullable OrcProto.Type type) { + return toType(type); + } + })); + } + + private static OrcTypeKind toTypeKind(OrcProto.Type.Kind kind) + { + switch (kind) { + case BOOLEAN: + return OrcTypeKind.BOOLEAN; + case BYTE: + return OrcTypeKind.BYTE; + case SHORT: + return OrcTypeKind.SHORT; + case INT: + return OrcTypeKind.INT; + case LONG: + return OrcTypeKind.LONG; + case FLOAT: + return OrcTypeKind.FLOAT; + case DOUBLE: + return OrcTypeKind.DOUBLE; + case STRING: + return OrcTypeKind.STRING; + case BINARY: + return OrcTypeKind.BINARY; + case TIMESTAMP: + return OrcTypeKind.TIMESTAMP; + case LIST: + return OrcTypeKind.LIST; + case MAP: + return OrcTypeKind.MAP; + case STRUCT: + return OrcTypeKind.STRUCT; + case UNION: + return OrcTypeKind.UNION; + default: + throw new IllegalArgumentException(kind + " data type not implemented yet"); + } + } + + private static StreamKind toStreamKind(OrcProto.Stream.Kind kind) + { + switch (kind) { + case PRESENT: + return StreamKind.PRESENT; + case DATA: + return StreamKind.DATA; + case LENGTH: + return StreamKind.LENGTH; + case DICTIONARY_DATA: + return StreamKind.DICTIONARY_DATA; + case DICTIONARY_COUNT: + return StreamKind.DICTIONARY_COUNT; + case NANO_DATA: + return StreamKind.SECONDARY; + case ROW_INDEX: + return StreamKind.ROW_INDEX; + case IN_DICTIONARY: + return StreamKind.IN_DICTIONARY; + case STRIDE_DICTIONARY: + return StreamKind.ROW_GROUP_DICTIONARY; + case 
STRIDE_DICTIONARY_LENGTH: + return StreamKind.ROW_GROUP_DICTIONARY_LENGTH; + default: + throw new IllegalArgumentException(kind + " stream type not implemented yet"); + } + } + + private static ColumnEncodingKind toColumnEncodingKind(OrcTypeKind type, Kind kind) + { + switch (kind) { + case DIRECT: + if (type == OrcTypeKind.SHORT || type == OrcTypeKind.INT || type == OrcTypeKind.LONG) { + return ColumnEncodingKind.DWRF_DIRECT; + } + else { + return ColumnEncodingKind.DIRECT; + } + case DICTIONARY: + return ColumnEncodingKind.DICTIONARY; + default: + throw new IllegalArgumentException(kind + " stream encoding not implemented yet"); + } + } + + private static CompressionKind toCompression(OrcProto.CompressionKind compression) + { + switch (compression) { + case NONE: + return UNCOMPRESSED; + case ZLIB: + return ZLIB; + case SNAPPY: + return SNAPPY; + default: + throw new IllegalArgumentException(compression + " compression not implemented yet"); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Footer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Footer.java new file mode 100644 index 0000000000..54bc53fa54 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Footer.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class Footer +{ + private final long numberOfRows; + private final int rowsInRowGroup; + private final List stripes; + private final List types; + private final List fileStats; + + public Footer(long numberOfRows, int rowsInRowGroup, List stripes, List types, List fileStats) + { + this.numberOfRows = numberOfRows; + this.rowsInRowGroup = rowsInRowGroup; + this.stripes = ImmutableList.copyOf(checkNotNull(stripes, "stripes is null")); + this.types = ImmutableList.copyOf(checkNotNull(types, "types is null")); + this.fileStats = ImmutableList.copyOf(checkNotNull(fileStats, "columnStatistics is null")); + } + + public long getNumberOfRows() + { + return numberOfRows; + } + + public int getRowsInRowGroup() + { + return rowsInRowGroup; + } + + public List getStripes() + { + return stripes; + } + + public List getTypes() + { + return types; + } + + public List getFileStats() + { + return fileStats; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("numberOfRows", numberOfRows) + .add("rowsInRowGroup", rowsInRowGroup) + .add("stripes", stripes) + .add("types", types) + .add("columnStatistics", fileStats) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/IntegerStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/IntegerStatistics.java new file mode 100644 index 0000000000..6ab7b1252c --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/IntegerStatistics.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use 
this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +public class IntegerStatistics + implements RangeStatistics +{ + private final Long minimum; + private final Long maximum; + + public IntegerStatistics(Long minimum, Long maximum) + { + this.minimum = minimum; + this.maximum = maximum; + } + + public Long getMin() + { + return minimum; + } + + public Long getMax() + { + return maximum; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Metadata.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Metadata.java new file mode 100644 index 0000000000..29d8068332 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Metadata.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import java.util.List; + +public class Metadata +{ + private final List stripeStatistics; + + public Metadata(List stripeStatistics) + { + this.stripeStatistics = stripeStatistics; + } + + public List getStripeStatsList() + { + return stripeStatistics; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/MetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/MetadataReader.java new file mode 100644 index 0000000000..a6805dc94f --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/MetadataReader.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +public interface MetadataReader +{ + PostScript readPostScript(byte[] data, int offset, int length) + throws IOException; + + Metadata readMetadata(InputStream inputStream) + throws IOException; + + Footer readFooter(InputStream inputStream) + throws IOException; + + StripeFooter readStripeFooter(List types, InputStream inputStream) + throws IOException; + + List readRowIndexes(InputStream inputStream) + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java new file mode 100644 index 0000000000..e83221775f --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java @@ -0,0 +1,487 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.facebook.presto.hive.protobuf.CodedInputStream; +import com.google.common.base.Function; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.common.primitives.Ints; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.hadoop.hive.ql.io.orc.OrcProto; +import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; +import static com.google.common.base.Preconditions.checkState; +import static java.lang.Character.MIN_SURROGATE; + +public class OrcMetadataReader + implements MetadataReader +{ + private static final Slice MAX_BYTE = Slices.wrappedBuffer(new byte[] { (byte) 0xFF }); + + @Override + public PostScript readPostScript(byte[] data, int offset, int length) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(data, offset, length); + OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(input); + + return new PostScript( + postScript.getVersionList(), + postScript.getFooterLength(), + postScript.getMetadataLength(), + toCompression(postScript.getCompression()), + postScript.getCompressionBlockSize()); + } + + @Override + public Metadata readMetadata(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.Metadata metadata = OrcProto.Metadata.parseFrom(input); + return new 
Metadata(toStripeStatistics(metadata.getStripeStatsList())); + } + + private static List toStripeStatistics(List types) + { + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(types, OrcMetadataReader::toStripeStatistics)); + return ImmutableList.copyOf(Iterables.transform(types, new Function() { + @Nullable + @Override + public StripeStatistics apply(@Nullable OrcProto.StripeStatistics stripeStatistics) { + return new StripeStatistics(toColumnStatistics(stripeStatistics.getColStatsList(), false)); + } + })); + } + + private static StripeStatistics toStripeStatistics(OrcProto.StripeStatistics stripeStatistics) + { + return new StripeStatistics(toColumnStatistics(stripeStatistics.getColStatsList(), false)); + } + + @Override + public Footer readFooter(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.Footer footer = OrcProto.Footer.parseFrom(input); + return new Footer( + footer.getNumberOfRows(), + footer.getRowIndexStride(), + toStripeInformation(footer.getStripesList()), + toType(footer.getTypesList()), + toColumnStatistics(footer.getStatisticsList(), false)); + } + + private static List toStripeInformation(List types) + { + // Modifying for JDK 1.6 + return ImmutableList.copyOf(Iterables.transform(types, new Function() { + @Nullable + @Override + public StripeInformation apply(@Nullable OrcProto.StripeInformation stripeInformation) { + return toStripeInformation(stripeInformation); + } + })); + } + + private static StripeInformation toStripeInformation(OrcProto.StripeInformation stripeInformation) + { + return new StripeInformation( + Ints.checkedCast(stripeInformation.getNumberOfRows()), + stripeInformation.getOffset(), + stripeInformation.getIndexLength(), + stripeInformation.getDataLength(), + stripeInformation.getFooterLength()); + } + + @Override + public StripeFooter readStripeFooter(List types, InputStream inputStream) + throws IOException + { + 
CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.StripeFooter stripeFooter = OrcProto.StripeFooter.parseFrom(input); + return new StripeFooter(toStream(stripeFooter.getStreamsList()), toColumnEncoding(stripeFooter.getColumnsList())); + } + + private static Stream toStream(OrcProto.Stream stream) + { + return new Stream(stream.getColumn(), toStreamKind(stream.getKind()), Ints.checkedCast(stream.getLength()), true); + } + + private static List toStream(List streams) + { + // Modifying for JDK 1.6 + //return ImmutableList.copyOf(Iterables.transform(streams, OrcMetadataReader::toStream)); + return ImmutableList.copyOf(Iterables.transform(streams, new Function() { + @Nullable + @Override + public Stream apply(@Nullable OrcProto.Stream stream) { + return toStream(stream); + } + })); + } + + private static ColumnEncoding toColumnEncoding(OrcProto.ColumnEncoding columnEncoding) + { + return new ColumnEncoding(toColumnEncodingKind(columnEncoding.getKind()), columnEncoding.getDictionarySize()); + } + + private static List toColumnEncoding(List columnEncodings) + { + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(columnEncodings, OrcMetadataReader::toColumnEncoding)); + return ImmutableList.copyOf(Iterables.transform(columnEncodings, new Function() { + @Nullable + @Override + public ColumnEncoding apply(@Nullable OrcProto.ColumnEncoding columnEncoding) { + return toColumnEncoding(columnEncoding); + } + })); + } + + @Override + public List readRowIndexes(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); + + // Modifying for JDK 1.6 + //return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), OrcMetadataReader::toRowGroupIndex)); + return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() { + @Nullable + @Override + public RowGroupIndex 
apply(@Nullable RowIndexEntry rowIndexEntry) { + return toRowGroupIndex(rowIndexEntry); + } + })); + } + + private static RowGroupIndex toRowGroupIndex(RowIndexEntry rowIndexEntry) + { + List positionsList = rowIndexEntry.getPositionsList(); + ImmutableList.Builder positions = ImmutableList.builder(); + for (int index = 0; index < positionsList.size(); index++) { + long longPosition = positionsList.get(index); + int intPosition = (int) longPosition; + + checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index); + + positions.add(intPosition); + } + return new RowGroupIndex(positions.build(), toColumnStatistics(rowIndexEntry.getStatistics(), true)); + } + + private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) + { + return new ColumnStatistics( + statistics.getNumberOfValues(), + toBooleanStatistics(statistics.getBucketStatistics()), + toIntegerStatistics(statistics.getIntStatistics()), + toDoubleStatistics(statistics.getDoubleStatistics()), + toStringStatistics(statistics.getStringStatistics(), isRowGroup), + toDateStatistics(statistics.getDateStatistics(), isRowGroup)); + } + + private static List toColumnStatistics(List columnStatistics, final boolean isRowGroup) + { + if (columnStatistics == null) { + return ImmutableList.of(); + } + + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(columnStatistics, statistics -> toColumnStatistics(statistics, isRowGroup))); + return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() { + @Nullable + @Override + public ColumnStatistics apply(@Nullable OrcProto.ColumnStatistics columnStatistics) { + return toColumnStatistics(columnStatistics, isRowGroup); + } + })); + } + + private static BooleanStatistics toBooleanStatistics(OrcProto.BucketStatistics bucketStatistics) + { + if (bucketStatistics.getCountCount() == 0) { + return null; + } + + return new 
BooleanStatistics(bucketStatistics.getCount(0)); + } + + private static IntegerStatistics toIntegerStatistics(OrcProto.IntegerStatistics integerStatistics) + { + if (!integerStatistics.hasMinimum() && !integerStatistics.hasMaximum()) { + return null; + } + + return new IntegerStatistics( + integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null, + integerStatistics.hasMaximum() ? integerStatistics.getMaximum() : null); + } + + private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics doubleStatistics) + { + if (!doubleStatistics.hasMinimum() && !doubleStatistics.hasMaximum()) { + return null; + } + + // TODO remove this when double statistics are changed to correctly deal with NaNs + // if either min, max, or sum is NaN, ignore the stat + if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || + (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum())) || + (doubleStatistics.hasSum() && Double.isNaN(doubleStatistics.getSum()))) { + return null; + } + + return new DoubleStatistics( + doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null, + doubleStatistics.hasMaximum() ? doubleStatistics.getMaximum() : null); + } + + private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) + { + // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 + if (!isRowGroup) { + return null; + } + + if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { + return null; + } + + /* + The writer performs comparisons using java Strings to determine the minimum and maximum + values. This results in weird behaviors in the presence of surrogate pairs and special characters. 
+ + For example, unicode codepoint 0x1D403 has the following representations: + UTF-16: [0xD835, 0xDC03] + UTF-8: [0xF0, 0x9D, 0x90, 0x83] + + while codepoint 0xFFFD (the replacement character) has the following representations: + UTF-16: [0xFFFD] + UTF-8: [0xEF, 0xBF, 0xBD] + + when comparisons between strings containing these characters are done with Java Strings (UTF-16), + 0x1D403 < 0xFFFD, but when comparisons are done using raw codepoints or UTF-8, 0x1D403 > 0xFFFD + + We use the following logic to ensure that we have a wider range of min-max + * if a min string has a surrogate character, the min string is truncated + at the first occurrence of the surrogate character (to exclude the surrogate character) + * if a max string has a surrogate character, the max string is truncated + at the first occurrence the surrogate character and 0xFF byte is appended to it. + + */ + Slice minimum = stringStatistics.hasMinimum() ? getMinSlice(stringStatistics.getMinimum()) : null; + Slice maximum = stringStatistics.hasMaximum() ? 
getMaxSlice(stringStatistics.getMaximum()) : null; + + return new StringStatistics(minimum, maximum); + } + + @VisibleForTesting + public static Slice getMaxSlice(String maximum) + { + if (maximum == null) { + return null; + } + + int index = firstSurrogateCharacter(maximum); + if (index == -1) { + return Slices.utf8Slice(maximum); + } + // Append 0xFF so that it is larger than maximum + return concatSlices(Slices.utf8Slice(maximum.substring(0, index)), MAX_BYTE); + } + + @VisibleForTesting + public static Slice getMinSlice(String minimum) + { + if (minimum == null) { + return null; + } + + int index = firstSurrogateCharacter(minimum); + if (index == -1) { + return Slices.utf8Slice(minimum); + } + // truncate the string at the first surrogate character + return Slices.utf8Slice(minimum.substring(0, index)); + } + + // returns index of first surrogateCharacter in the string -1 if no surrogate character is found + @VisibleForTesting + static int firstSurrogateCharacter(String value) + { + char[] chars = value.toCharArray(); + for (int i = 0; i < chars.length; i++) { + if (chars[i] >= MIN_SURROGATE) { + return i; + } + } + return -1; + } + + @VisibleForTesting + static Slice concatSlices(Slice slice1, Slice slice2) + { + Slice slice = Slices.allocate(slice1.length() + slice2.length()); + slice.setBytes(0, slice1.getBytes()); + slice.setBytes(slice1.length(), slice2.getBytes()); + return slice; + } + + private static DateStatistics toDateStatistics(OrcProto.DateStatistics dateStatistics, boolean isRowGroup) + { + // TODO remove this when date statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 + if (!isRowGroup) { + return null; + } + + if (!dateStatistics.hasMinimum() && !dateStatistics.hasMaximum()) { + return null; + } + + return new DateStatistics( + dateStatistics.hasMinimum() ? dateStatistics.getMinimum() : null, + dateStatistics.hasMaximum() ? 
dateStatistics.getMaximum() : null); + } + + private static OrcType toType(OrcProto.Type type) + { + return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList()); + } + + private static List toType(List types) + { + // Modifying for JDK 1.6 + // return ImmutableList.copyOf(Iterables.transform(types, OrcMetadataReader::toType)); + return ImmutableList.copyOf(Iterables.transform(types, new Function() { + @Nullable + @Override + public OrcType apply(@Nullable OrcProto.Type type) { + return toType(type); + } + })); + } + + private static OrcTypeKind toTypeKind(OrcProto.Type.Kind typeKind) + { + switch (typeKind) { + case BOOLEAN: + return OrcTypeKind.BOOLEAN; + case BYTE: + return OrcTypeKind.BYTE; + case SHORT: + return OrcTypeKind.SHORT; + case INT: + return OrcTypeKind.INT; + case LONG: + return OrcTypeKind.LONG; + case FLOAT: + return OrcTypeKind.FLOAT; + case DOUBLE: + return OrcTypeKind.DOUBLE; + case STRING: + return OrcTypeKind.STRING; + case BINARY: + return OrcTypeKind.BINARY; + case TIMESTAMP: + return OrcTypeKind.TIMESTAMP; + case LIST: + return OrcTypeKind.LIST; + case MAP: + return OrcTypeKind.MAP; + case STRUCT: + return OrcTypeKind.STRUCT; + case UNION: + return OrcTypeKind.UNION; + case DECIMAL: + return OrcTypeKind.DECIMAL; + case DATE: + return OrcTypeKind.DATE; + case VARCHAR: + return OrcTypeKind.VARCHAR; + case CHAR: + return OrcTypeKind.CHAR; + default: + throw new IllegalStateException(typeKind + " stream type not implemented yet"); + } + } + + private static StreamKind toStreamKind(OrcProto.Stream.Kind streamKind) + { + switch (streamKind) { + case PRESENT: + return StreamKind.PRESENT; + case DATA: + return StreamKind.DATA; + case LENGTH: + return StreamKind.LENGTH; + case DICTIONARY_DATA: + return StreamKind.DICTIONARY_DATA; + case DICTIONARY_COUNT: + return StreamKind.DICTIONARY_COUNT; + case SECONDARY: + return StreamKind.SECONDARY; + case ROW_INDEX: + return StreamKind.ROW_INDEX; + default: + throw new 
IllegalStateException(streamKind + " stream type not implemented yet"); + } + } + + private static ColumnEncodingKind toColumnEncodingKind(OrcProto.ColumnEncoding.Kind columnEncodingKind) + { + switch (columnEncodingKind) { + case DIRECT: + return ColumnEncodingKind.DIRECT; + case DIRECT_V2: + return ColumnEncodingKind.DIRECT_V2; + case DICTIONARY: + return ColumnEncodingKind.DICTIONARY; + case DICTIONARY_V2: + return ColumnEncodingKind.DICTIONARY_V2; + default: + throw new IllegalStateException(columnEncodingKind + " stream encoding not implemented yet"); + } + } + + private static CompressionKind toCompression(OrcProto.CompressionKind compression) + { + switch (compression) { + case NONE: + return UNCOMPRESSED; + case ZLIB: + return ZLIB; + case SNAPPY: + return SNAPPY; + default: + throw new IllegalStateException(compression + " compression not implemented yet"); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcType.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcType.java new file mode 100644 index 0000000000..2690d626a6 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcType.java @@ -0,0 +1,105 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +public class OrcType +{ + public enum OrcTypeKind + { + BOOLEAN, + + BYTE, + SHORT, + INT, + LONG, + DECIMAL, + + FLOAT, + DOUBLE, + + STRING, + VARCHAR, + CHAR, + + BINARY, + + DATE, + TIMESTAMP, + + LIST, + MAP, + STRUCT, + UNION, + } + + private final OrcTypeKind orcTypeKind; + private final List fieldTypeIndexes; + private final List fieldNames; + + public OrcType(OrcTypeKind orcTypeKind, List fieldTypeIndexes, List fieldNames) + { + this.orcTypeKind = checkNotNull(orcTypeKind, "typeKind is null"); + this.fieldTypeIndexes = ImmutableList.copyOf(checkNotNull(fieldTypeIndexes, "fieldTypeIndexes is null")); + if (fieldNames == null || (fieldNames.isEmpty() && !fieldTypeIndexes.isEmpty())) { + this.fieldNames = null; + } + else { + this.fieldNames = ImmutableList.copyOf(checkNotNull(fieldNames, "fieldNames is null")); + checkArgument(fieldNames.size() == fieldTypeIndexes.size(), "fieldNames and fieldTypeIndexes have different sizes"); + } + } + + public OrcTypeKind getOrcTypeKind() + { + return orcTypeKind; + } + + public int getFieldCount() + { + return fieldTypeIndexes.size(); + } + + public int getFieldTypeIndex(int field) + { + return fieldTypeIndexes.get(field); + } + + public String getFieldName(int field) + { + return fieldNames.get(field); + } + + public List getFieldNames() + { + return fieldNames; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("orcTypeKind", orcTypeKind) + .add("fieldTypeIndexes", fieldTypeIndexes) + .add("fieldNames", fieldNames) + .toString(); + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/PostScript.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/PostScript.java new file mode 100644 index 0000000000..89ea78a632 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/PostScript.java @@ -0,0 +1,76 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class PostScript +{ + private final List version; + private final long footerLength; + private final long metadataLength; + private final CompressionKind compression; + private final long compressionBlockSize; + + public PostScript(List version, long footerLength, long metadataLength, CompressionKind compression, long compressionBlockSize) + { + this.version = ImmutableList.copyOf(checkNotNull(version, "version is null")); + this.footerLength = footerLength; + this.metadataLength = metadataLength; + this.compression = checkNotNull(compression, "compressionKind is null"); + this.compressionBlockSize = compressionBlockSize; + } + + public List getVersion() + { + return version; + } + + public long getFooterLength() + { + 
return footerLength; + } + + public long getMetadataLength() + { + return metadataLength; + } + + public CompressionKind getCompression() + { + return compression; + } + + public long getCompressionBlockSize() + { + return compressionBlockSize; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("version", version) + .add("footerLength", footerLength) + .add("metadataLength", metadataLength) + .add("compressionKind", compression) + .add("compressionBlockSize", compressionBlockSize) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RangeStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RangeStatistics.java new file mode 100644 index 0000000000..325003fb19 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RangeStatistics.java @@ -0,0 +1,20 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +public interface RangeStatistics +{ + T getMin(); + T getMax(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RowGroupIndex.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RowGroupIndex.java new file mode 100644 index 0000000000..5691882e9d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RowGroupIndex.java @@ -0,0 +1,42 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static com.google.common.base.Preconditions.checkNotNull; + +public class RowGroupIndex +{ + private final List positions; + private final ColumnStatistics statistics; + + public RowGroupIndex(List positions, ColumnStatistics statistics) + { + this.positions = ImmutableList.copyOf(checkNotNull(positions, "positions is null")); + this.statistics = checkNotNull(statistics, "statistics is null"); + } + + public List getPositions() + { + return positions; + } + + public ColumnStatistics getColumnStatistics() + { + return statistics; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Stream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Stream.java new file mode 100644 index 0000000000..3440143b0d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Stream.java @@ -0,0 +1,78 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class Stream +{ + public enum StreamKind + { + PRESENT, + DATA, + LENGTH, + DICTIONARY_DATA, + DICTIONARY_COUNT, + SECONDARY, + ROW_INDEX, + IN_DICTIONARY, + ROW_GROUP_DICTIONARY, + ROW_GROUP_DICTIONARY_LENGTH, + } + + private final int column; + private final StreamKind streamKind; + private final int length; + private final boolean useVInts; + + public Stream(int column, StreamKind streamKind, int length, boolean useVInts) + { + this.column = column; + this.streamKind = checkNotNull(streamKind, "streamKind is null"); + this.length = length; + this.useVInts = useVInts; + } + + public int getColumn() + { + return column; + } + + public StreamKind getStreamKind() + { + return streamKind; + } + + public int getLength() + { + return length; + } + + public boolean isUseVInts() + { + return useVInts; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("column", column) + .add("streamKind", streamKind) + .add("length", length) + .add("useVInts", useVInts) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java new file mode 100644 index 0000000000..a61849b8d7 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java @@ -0,0 +1,41 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import io.airlift.slice.Slice; + +public class StringStatistics + implements RangeStatistics +{ + private final Slice minimum; + private final Slice maximum; + + public StringStatistics(Slice minimum, Slice maximum) + { + this.minimum = minimum; + this.maximum = maximum; + } + + @Override + public Slice getMin() + { + return minimum; + } + + @Override + public Slice getMax() + { + return maximum; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeFooter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeFooter.java new file mode 100644 index 0000000000..567ad0d7ad --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeFooter.java @@ -0,0 +1,42 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static com.google.common.base.Preconditions.checkNotNull; + +public class StripeFooter +{ + private final List streams; + private final List columnEncodings; + + public StripeFooter(List streams, List columnEncodings) + { + this.streams = ImmutableList.copyOf(checkNotNull(streams, "streams is null")); + this.columnEncodings = ImmutableList.copyOf(checkNotNull(columnEncodings, "columnEncodings is null")); + } + + public List getColumnEncodings() + { + return columnEncodings; + } + + public List getStreams() + { + return streams; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeInformation.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeInformation.java new file mode 100644 index 0000000000..5a5a55fe1a --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeInformation.java @@ -0,0 +1,71 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import static com.google.common.base.MoreObjects.toStringHelper; + +public class StripeInformation +{ + private final int numberOfRows; + private final long offset; + private final long indexLength; + private final long dataLength; + private final long footerLength; + + public StripeInformation(int numberOfRows, long offset, long indexLength, long dataLength, long footerLength) + { + this.numberOfRows = numberOfRows; + this.offset = offset; + this.indexLength = indexLength; + this.dataLength = dataLength; + this.footerLength = footerLength; + } + + public int getNumberOfRows() + { + return numberOfRows; + } + + public long getOffset() + { + return offset; + } + + public long getIndexLength() + { + return indexLength; + } + + public long getDataLength() + { + return dataLength; + } + + public long getFooterLength() + { + return footerLength; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("numberOfRows", numberOfRows) + .add("offset", offset) + .add("indexLength", indexLength) + .add("dataLength", dataLength) + .add("footerLength", footerLength) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeStatistics.java new file mode 100644 index 0000000000..5338325482 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeStatistics.java @@ -0,0 +1,35 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +import static com.google.common.base.Preconditions.checkNotNull; + +public class StripeStatistics +{ + private final List columnStatistics; + + public StripeStatistics(List columnStatistics) + { + this.columnStatistics = ImmutableList.copyOf(checkNotNull(columnStatistics, "columnStatistics is null")); + } + + public List getColumnStatistics() + { + return columnStatistics; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java new file mode 100644 index 0000000000..6ea9dc1e2e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java @@ -0,0 +1,160 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.BooleanVector; +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class BooleanStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream dataStream; + + private boolean rowGroupOpen; + + public BooleanStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public 
void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + dataStream.skip(readOffset); + } + } + + BooleanVector booleanVector = castOrcVector(vector, BooleanVector.class); + if (presentStream == null) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + Arrays.fill(booleanVector.isNull, false); + dataStream.getSetBits(nextBatchSize, booleanVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, booleanVector.isNull); + if (nullValues != nextBatchSize) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + dataStream.getSetBits(nextBatchSize, booleanVector.vector, booleanVector.isNull); + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(BooleanStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + 
dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, BooleanStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java new file mode 100644 index 0000000000..14e1d6de9e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java @@ -0,0 +1,162 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.LongVector; +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.ByteStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class ByteStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(ByteStream.class); + @Nullable + private ByteStream dataStream; + + private boolean rowGroupOpen; + + public ByteStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + 
nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + dataStream.skip(readOffset); + } + } + + LongVector byteVector = castOrcVector(vector, LongVector.class); + if (presentStream == null) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + Arrays.fill(byteVector.isNull, false); + dataStream.nextVector(nextBatchSize, byteVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, byteVector.isNull); + if (nullValues != nextBatchSize) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + dataStream.nextVector(nextBatchSize, byteVector.vector, byteVector.isNull); + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(ByteStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, 
PRESENT, BooleanStream.class); + dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java new file mode 100644 index 0000000000..db05a04da0 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java @@ -0,0 +1,162 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.DoubleVector; +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.DoubleStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class DoubleStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(DoubleStream.class); + @Nullable + private DoubleStream dataStream; + + private boolean rowGroupOpen; + + public DoubleStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += 
nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + dataStream.skip(readOffset); + } + } + + DoubleVector doubleVector = castOrcVector(vector, DoubleVector.class); + if (presentStream == null) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + Arrays.fill(doubleVector.isNull, false); + dataStream.nextVector(nextBatchSize, doubleVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, doubleVector.isNull); + if (nullValues != nextBatchSize) { + if (dataStream == null) { + throw new OrcCorruptionException("Value is not null but data stream is not present"); + } + dataStream.nextVector(nextBatchSize, doubleVector.vector, doubleVector.isNull); + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(DoubleStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = 
dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, DoubleStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java new file mode 100644 index 0000000000..f9c55fd78f --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java @@ -0,0 +1,163 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tajo.storage.thirdparty.orc.reader;

import org.apache.tajo.storage.thirdparty.orc.DoubleVector;
import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException;
import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor;
import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding;
import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream;
import org.apache.tajo.storage.thirdparty.orc.stream.FloatStream;
import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource;
import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA;
import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT;
import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector;
import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Reads an ORC FLOAT column into a {@link DoubleVector}.
 * <p>
 * Values are widened to double because the vector model used here has no
 * dedicated float vector (see the note in {@link #readBatch(Object)}).
 * Not thread-safe; one instance reads one column of one file sequentially.
 */
public class FloatStreamReader
        implements StreamReader
{
    private final StreamDescriptor streamDescriptor;

    // Rows skipped since the last readBatch (applied lazily) and the size of
    // the batch requested by prepareNextRead.
    private int readOffset;
    private int nextBatchSize;

    @Nonnull
    private StreamSource<BooleanStream> presentStreamSource = missingStreamSource(BooleanStream.class);
    @Nullable
    private BooleanStream presentStream;

    @Nonnull
    private StreamSource<FloatStream> dataStreamSource = missingStreamSource(FloatStream.class);
    @Nullable
    private FloatStream dataStream;

    private boolean rowGroupOpen;

    public FloatStreamReader(StreamDescriptor streamDescriptor)
    {
        this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null");
    }

    @Override
    public void prepareNextRead(int batchSize)
    {
        // Defer the skip of any rows the caller did not read; the actual
        // stream positioning happens at the start of the next readBatch.
        readOffset += nextBatchSize;
        nextBatchSize = batchSize;
    }

    /**
     * Reads the next batch into {@code vector}, which must be a {@link DoubleVector}.
     *
     * @throws OrcCorruptionException if a required data stream is missing
     */
    @Override
    public void readBatch(Object vector)
            throws IOException
    {
        if (!rowGroupOpen) {
            openRowGroup();
        }

        if (readOffset > 0) {
            if (presentStream != null) {
                // skip ahead the present bit reader, but count the set bits
                // and use this as the skip size for the data reader
                readOffset = presentStream.countBitsSet(readOffset);
            }
            if (readOffset > 0) {
                if (dataStream == null) {
                    throw new OrcCorruptionException("Value is not null but data stream is not present");
                }
                dataStream.skip(readOffset);
            }
        }

        // we could add a float vector but Presto currently doesn't support floats
        DoubleVector floatVector = castOrcVector(vector, DoubleVector.class);
        if (presentStream == null) {
            // No present stream: every value is non-null.
            if (dataStream == null) {
                throw new OrcCorruptionException("Value is not null but data stream is not present");
            }
            Arrays.fill(floatVector.isNull, false);
            dataStream.nextVector(nextBatchSize, floatVector.vector);
        }
        else {
            int nullValues = presentStream.getUnsetBits(nextBatchSize, floatVector.isNull);
            if (nullValues != nextBatchSize) {
                if (dataStream == null) {
                    throw new OrcCorruptionException("Value is not null but data stream is not present");
                }
                dataStream.nextVector(nextBatchSize, floatVector.vector, floatVector.isNull);
            }
        }

        readOffset = 0;
        nextBatchSize = 0;
    }

    // Opens the per-row-group streams from the current stream sources.
    private void openRowGroup()
            throws IOException
    {
        presentStream = presentStreamSource.openStream();
        dataStream = dataStreamSource.openStream();

        rowGroupOpen = true;
    }

    @Override
    public void startStripe(StreamSources dictionaryStreamSources, List<ColumnEncoding> encoding)
            throws IOException
    {
        // FLOAT has no dictionary encoding; just reset all per-stripe state.
        presentStreamSource = missingStreamSource(BooleanStream.class);
        dataStreamSource = missingStreamSource(FloatStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        presentStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public void startRowGroup(StreamSources dataStreamSources)
            throws IOException
    {
        presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class);
        dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, FloatStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        presentStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public String toString()
    {
        return toStringHelper(this)
                .addValue(streamDescriptor)
                .toString();
    }
}
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tajo.storage.thirdparty.orc.reader;

import org.apache.tajo.storage.thirdparty.orc.LongVector;
import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException;
import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor;
import org.apache.tajo.storage.thirdparty.orc.Vector;
import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding;
import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream;
import org.apache.tajo.storage.thirdparty.orc.stream.LongStream;
import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource;
import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*;
import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector;
import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Reads a dictionary-encoded integer column into a {@link LongVector}.
 * <p>
 * The data stream holds dictionary indexes; the optional IN_DICTIONARY
 * bitmap marks which rows are indexes (vs. literal values kept in the data
 * stream itself). The stripe dictionary is read lazily, once per stripe,
 * the first time a row group is opened. Not thread-safe.
 */
public class LongDictionaryStreamReader
        implements StreamReader
{
    private final StreamDescriptor streamDescriptor;

    // Rows skipped since the last readBatch (applied lazily) and the size of
    // the batch requested by prepareNextRead.
    private int readOffset;
    private int nextBatchSize;

    @Nonnull
    private StreamSource<BooleanStream> presentStreamSource = missingStreamSource(BooleanStream.class);
    @Nullable
    private BooleanStream presentStream;

    @Nonnull
    private StreamSource<LongStream> dictionaryDataStreamSource = missingStreamSource(LongStream.class);
    private int dictionarySize;
    // Stripe dictionary values; grown on demand, reused across stripes.
    @Nonnull
    private long[] dictionary = new long[0];

    @Nonnull
    private StreamSource<BooleanStream> inDictionaryStreamSource = missingStreamSource(BooleanStream.class);
    @Nullable
    private BooleanStream inDictionaryStream;
    private final boolean[] inDictionary = new boolean[Vector.MAX_VECTOR_LENGTH];

    // NOTE(review): the original declared this field @Nonnull but left it
    // uninitialized; initialize with a missing source like the sibling fields
    // so the invariant holds before the first startStripe call.
    @Nonnull
    private StreamSource<LongStream> dataStreamSource = missingStreamSource(LongStream.class);
    @Nullable
    private LongStream dataStream;

    private boolean dictionaryOpen;
    private boolean rowGroupOpen;

    public LongDictionaryStreamReader(StreamDescriptor streamDescriptor)
    {
        this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null");
    }

    @Override
    public void prepareNextRead(int batchSize)
    {
        // Defer the skip of unread rows until the next readBatch.
        readOffset += nextBatchSize;
        nextBatchSize = batchSize;
    }

    /**
     * Reads the next batch into {@code vector}, which must be a {@link LongVector},
     * resolving dictionary indexes to values.
     *
     * @throws OrcCorruptionException if a required data stream is missing
     */
    @Override
    public void readBatch(Object vector)
            throws IOException
    {
        if (!rowGroupOpen) {
            openRowGroup();
        }

        if (readOffset > 0) {
            if (presentStream != null) {
                // skip ahead the present bit reader, but count the set bits
                // and use this as the skip size for the length reader
                readOffset = presentStream.countBitsSet(readOffset);
            }

            if (inDictionaryStream != null) {
                inDictionaryStream.skip(readOffset);
            }

            if (readOffset > 0) {
                if (dataStream == null) {
                    throw new OrcCorruptionException("Value is not null but data stream is not present");
                }
                dataStream.skip(readOffset);
            }
        }

        LongVector longVector = castOrcVector(vector, LongVector.class);

        if (presentStream == null) {
            // No present stream: every value is non-null.
            if (dataStream == null) {
                throw new OrcCorruptionException("Value is not null but data stream is not present");
            }
            Arrays.fill(longVector.isNull, false);
            dataStream.nextLongVector(nextBatchSize, longVector.vector);
        }
        else {
            int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull);
            if (nullValues != nextBatchSize) {
                if (dataStream == null) {
                    throw new OrcCorruptionException("Value is not null but data stream is not present");
                }
                dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull);
            }
        }

        if (inDictionaryStream == null) {
            // Without an IN_DICTIONARY stream every value is a dictionary index.
            Arrays.fill(inDictionary, true);
        }
        else {
            inDictionaryStream.getSetBits(nextBatchSize, inDictionary, longVector.isNull);
        }

        // Replace dictionary indexes with the actual dictionary values;
        // rows not in the dictionary already hold literal values.
        for (int i = 0; i < nextBatchSize; i++) {
            if (!longVector.isNull[i]) {
                if (inDictionary[i]) {
                    longVector.vector[i] = dictionary[((int) longVector.vector[i])];
                }
            }
        }

        readOffset = 0;
        nextBatchSize = 0;
    }

    // Loads the stripe dictionary (once per stripe) and opens the row-group streams.
    private void openRowGroup()
            throws IOException
    {
        // read the dictionary
        if (!dictionaryOpen && dictionarySize > 0) {
            if (dictionary.length < dictionarySize) {
                dictionary = new long[dictionarySize];
            }

            LongStream dictionaryStream = dictionaryDataStreamSource.openStream();
            if (dictionaryStream == null) {
                throw new OrcCorruptionException("Dictionary is not empty but data stream is not present");
            }
            dictionaryStream.nextLongVector(dictionarySize, dictionary);
        }
        dictionaryOpen = true;

        presentStream = presentStreamSource.openStream();
        inDictionaryStream = inDictionaryStreamSource.openStream();
        dataStream = dataStreamSource.openStream();

        rowGroupOpen = true;
    }

    @Override
    public void startStripe(StreamSources dictionaryStreamSources, List<ColumnEncoding> encoding)
            throws IOException
    {
        dictionaryDataStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, LongStream.class);
        dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize();
        dictionaryOpen = false;

        inDictionaryStreamSource = missingStreamSource(BooleanStream.class);
        presentStreamSource = missingStreamSource(BooleanStream.class);
        dataStreamSource = missingStreamSource(LongStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        presentStream = null;
        inDictionaryStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public void startRowGroup(StreamSources dataStreamSources)
            throws IOException
    {
        presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class);
        inDictionaryStreamSource = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class);
        dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        presentStream = null;
        inDictionaryStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public String toString()
    {
        return toStringHelper(this)
                .addValue(streamDescriptor)
                .toString();
    }
}
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tajo.storage.thirdparty.orc.reader;

import org.apache.tajo.storage.thirdparty.orc.LongVector;
import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException;
import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor;
import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding;
import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream;
import org.apache.tajo.storage.thirdparty.orc.stream.LongStream;
import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource;
import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA;
import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT;
import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector;
import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Reads a direct-encoded (non-dictionary) integer column into a {@link LongVector}.
 * Not thread-safe; one instance reads one column of one file sequentially.
 */
public class LongDirectStreamReader
        implements StreamReader
{
    private final StreamDescriptor streamDescriptor;

    // Rows skipped since the last readBatch (applied lazily) and the size of
    // the batch requested by prepareNextRead.
    private int readOffset;
    private int nextBatchSize;

    @Nonnull
    private StreamSource<BooleanStream> presentStreamSource = missingStreamSource(BooleanStream.class);
    @Nullable
    private BooleanStream presentStream;

    @Nonnull
    private StreamSource<LongStream> dataStreamSource = missingStreamSource(LongStream.class);
    @Nullable
    private LongStream dataStream;

    private boolean rowGroupOpen;

    public LongDirectStreamReader(StreamDescriptor streamDescriptor)
    {
        this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null");
    }

    @Override
    public void prepareNextRead(int batchSize)
    {
        // Defer the skip of unread rows until the next readBatch.
        readOffset += nextBatchSize;
        nextBatchSize = batchSize;
    }

    /**
     * Reads the next batch into {@code vector}, which must be a {@link LongVector}.
     *
     * @throws OrcCorruptionException if a required data stream is missing
     */
    @Override
    public void readBatch(Object vector)
            throws IOException
    {
        if (!rowGroupOpen) {
            openRowGroup();
        }

        if (readOffset > 0) {
            if (presentStream != null) {
                // skip ahead the present bit reader, but count the set bits
                // and use this as the skip size for the data reader
                readOffset = presentStream.countBitsSet(readOffset);
            }
            if (readOffset > 0) {
                if (dataStream == null) {
                    throw new OrcCorruptionException("Value is not null but data stream is not present");
                }
                dataStream.skip(readOffset);
            }
        }

        LongVector longVector = castOrcVector(vector, LongVector.class);
        if (presentStream == null) {
            // No present stream: every value is non-null.
            if (dataStream == null) {
                throw new OrcCorruptionException("Value is not null but data stream is not present");
            }
            Arrays.fill(longVector.isNull, false);
            dataStream.nextLongVector(nextBatchSize, longVector.vector);
        }
        else {
            int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull);
            if (nullValues != nextBatchSize) {
                if (dataStream == null) {
                    throw new OrcCorruptionException("Value is not null but data stream is not present");
                }
                dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull);
            }
        }

        readOffset = 0;
        nextBatchSize = 0;
    }

    // Opens the per-row-group streams from the current stream sources.
    private void openRowGroup()
            throws IOException
    {
        presentStream = presentStreamSource.openStream();
        dataStream = dataStreamSource.openStream();

        rowGroupOpen = true;
    }

    @Override
    public void startStripe(StreamSources dictionaryStreamSources, List<ColumnEncoding> encoding)
            throws IOException
    {
        // Direct encoding has no dictionary; just reset all per-stripe state.
        presentStreamSource = missingStreamSource(BooleanStream.class);
        dataStreamSource = missingStreamSource(LongStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        presentStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public void startRowGroup(StreamSources dataStreamSources)
            throws IOException
    {
        presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class);
        dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        presentStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public String toString()
    {
        return toStringHelper(this)
                .addValue(streamDescriptor)
                .toString();
    }
}
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tajo.storage.thirdparty.orc.reader;

import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor;
import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding;
import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind;
import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources;

import java.io.IOException;
import java.util.List;

import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Facade for integer columns that dispatches to a direct or dictionary reader
 * depending on the column encoding of the current stripe.
 * <p>
 * The choice is made in {@link #startStripe}, so {@code startStripe} must be
 * called before any read method. Not thread-safe.
 */
public class LongStreamReader
        implements StreamReader
{
    private final StreamDescriptor streamDescriptor;
    private final LongDirectStreamReader directReader;
    private final LongDictionaryStreamReader dictionaryReader;
    // Reader selected for the current stripe's encoding; all calls delegate here.
    private StreamReader currentReader;

    public LongStreamReader(StreamDescriptor streamDescriptor)
    {
        this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null");
        directReader = new LongDirectStreamReader(streamDescriptor);
        dictionaryReader = new LongDictionaryStreamReader(streamDescriptor);
    }

    @Override
    public void prepareNextRead(int batchSize)
    {
        currentReader.prepareNextRead(batchSize);
    }

    @Override
    public void readBatch(Object vector)
            throws IOException
    {
        currentReader.readBatch(vector);
    }

    /**
     * Selects the delegate reader for this stripe's encoding and forwards the call.
     *
     * @throws IllegalArgumentException if the encoding is not a supported integer encoding
     */
    @Override
    public void startStripe(StreamSources dictionaryStreamSources, List<ColumnEncoding> encoding)
            throws IOException
    {
        ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind();
        if (kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT) {
            currentReader = directReader;
        }
        else if (kind == DICTIONARY) {
            currentReader = dictionaryReader;
        }
        else {
            throw new IllegalArgumentException("Unsupported encoding " + kind);
        }

        currentReader.startStripe(dictionaryStreamSources, encoding);
    }

    @Override
    public void startRowGroup(StreamSources dataStreamSources)
            throws IOException
    {
        currentReader.startRowGroup(dataStreamSources);
    }

    @Override
    public String toString()
    {
        return toStringHelper(this)
                .addValue(streamDescriptor)
                .toString();
    }
}
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; + +final class OrcReaderUtils +{ + private OrcReaderUtils() + { + } + + public static T castOrcVector(Object vector, Class type) + throws OrcCorruptionException + { + if (!type.isInstance(vector)) { + throw new OrcCorruptionException("Expected %s, but got %s", type.getSimpleName(), vector.getClass().getName()); + } + return type.cast(vector); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java new file mode 100644 index 0000000000..22c6a4d911 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java @@ -0,0 +1,286 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tajo.storage.thirdparty.orc.reader;

import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException;
import org.apache.tajo.storage.thirdparty.orc.SliceVector;
import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor;
import org.apache.tajo.storage.thirdparty.orc.Vector;
import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding;
import org.apache.tajo.storage.thirdparty.orc.stream.*;
import io.airlift.slice.Slice;
import io.airlift.slice.Slices;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*;
import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector;
import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Reads a dictionary-encoded string/binary column into a {@link SliceVector}.
 * <p>
 * Supports both the per-stripe dictionary and the optional per-row-group
 * dictionary (DWRF): the IN_DICTIONARY bitmap says which of the two a row's
 * index refers to. The stripe dictionary is read lazily, once per stripe;
 * the row-group dictionary is re-read for every row group. Not thread-safe.
 */
public class SliceDictionaryStreamReader
        implements StreamReader
{
    private final StreamDescriptor streamDescriptor;

    // Rows skipped since the last readBatch (applied lazily) and the size of
    // the batch requested by prepareNextRead.
    private int readOffset;
    private int nextBatchSize;

    @Nonnull
    private StreamSource<BooleanStream> presentStreamSource = missingStreamSource(BooleanStream.class);
    @Nullable
    private BooleanStream presentStream;
    private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH];

    @Nonnull
    private StreamSource<ByteArrayStream> dictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class);
    private boolean dictionaryOpen;
    private int dictionarySize;
    // Stripe dictionary entries; grown on demand, reused across stripes.
    @Nonnull
    private Slice[] dictionary = new Slice[0];

    @Nonnull
    private StreamSource<LongStream> dictionaryLengthStreamSource = missingStreamSource(LongStream.class);
    @Nonnull
    private int[] dictionaryLength = new int[0];

    @Nonnull
    private StreamSource<BooleanStream> inDictionaryStreamSource = missingStreamSource(BooleanStream.class);
    @Nullable
    private BooleanStream inDictionaryStream;
    private final boolean[] inDictionary = new boolean[Vector.MAX_VECTOR_LENGTH];

    @Nonnull
    private StreamSource<ByteArrayStream> rowGroupDictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class);
    @Nonnull
    private Slice[] rowGroupDictionary = new Slice[0];

    @Nonnull
    private StreamSource<RowGroupDictionaryLengthStream> rowGroupDictionaryLengthStreamSource = missingStreamSource(RowGroupDictionaryLengthStream.class);
    @Nonnull
    private int[] rowGroupDictionaryLength = new int[0];

    @Nonnull
    private StreamSource<LongStream> dataStreamSource = missingStreamSource(LongStream.class);
    @Nullable
    private LongStream dataStream;
    // Scratch buffer of dictionary indexes for the current batch.
    @Nonnull
    private final int[] dataVector = new int[Vector.MAX_VECTOR_LENGTH];

    private boolean rowGroupOpen;

    public SliceDictionaryStreamReader(StreamDescriptor streamDescriptor)
    {
        this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null");
    }

    @Override
    public void prepareNextRead(int batchSize)
    {
        // Defer the skip of unread rows until the next readBatch.
        readOffset += nextBatchSize;
        nextBatchSize = batchSize;
    }

    /**
     * Reads the next batch into {@code vector}, which must be a {@link SliceVector},
     * resolving each row's index against the stripe or row-group dictionary.
     *
     * @throws OrcCorruptionException if a required data stream is missing
     */
    @Override
    public void readBatch(Object vector)
            throws IOException
    {
        if (!rowGroupOpen) {
            openRowGroup();
        }

        if (readOffset > 0) {
            if (presentStream != null) {
                // skip ahead the present bit reader, but count the set bits
                // and use this as the skip size for the length reader
                readOffset = presentStream.countBitsSet(readOffset);
            }
            if (readOffset > 0) {
                if (dataStream == null) {
                    throw new OrcCorruptionException("Value is not null but data stream is not present");
                }
                if (inDictionaryStream != null) {
                    inDictionaryStream.skip(readOffset);
                }
                dataStream.skip(readOffset);
            }
        }

        SliceVector sliceVector = castOrcVector(vector, SliceVector.class);

        if (presentStream == null) {
            // No present stream: every value is non-null.
            if (dataStream == null) {
                throw new OrcCorruptionException("Value is not null but data stream is not present");
            }
            Arrays.fill(isNullVector, false);
            dataStream.nextIntVector(nextBatchSize, dataVector);
        }
        else {
            int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector);
            if (nullValues != nextBatchSize) {
                if (dataStream == null) {
                    throw new OrcCorruptionException("Value is not null but data stream is not present");
                }
                dataStream.nextIntVector(nextBatchSize, dataVector, isNullVector);
            }
        }

        if (inDictionaryStream == null) {
            // Without an IN_DICTIONARY stream all indexes target the stripe dictionary.
            Arrays.fill(inDictionary, true);
        }
        else {
            inDictionaryStream.getSetBits(nextBatchSize, inDictionary, isNullVector);
        }

        for (int i = 0; i < nextBatchSize; i++) {
            if (isNullVector[i]) {
                sliceVector.vector[i] = null;
            }
            else if (inDictionary[i]) {
                sliceVector.vector[i] = dictionary[dataVector[i]];
            }
            else {
                sliceVector.vector[i] = rowGroupDictionary[dataVector[i]];
            }
        }

        readOffset = 0;
        nextBatchSize = 0;
    }

    // Loads the stripe dictionary (once per stripe), the row-group dictionary
    // (every row group, when present), and opens the row-group streams.
    private void openRowGroup()
            throws IOException
    {
        // read the dictionary
        if (!dictionaryOpen && dictionarySize > 0) {
            // resize the dictionary array if necessary
            if (dictionary.length < dictionarySize) {
                dictionary = new Slice[dictionarySize];
                dictionaryLength = new int[dictionarySize];
            }

            // read the lengths
            LongStream lengthStream = dictionaryLengthStreamSource.openStream();
            if (lengthStream == null) {
                throw new OrcCorruptionException("Dictionary is not empty but dictionary length stream is not present");
            }
            lengthStream.nextIntVector(dictionarySize, dictionaryLength);

            ByteArrayStream dictionaryDataStream = dictionaryDataStreamSource.openStream();
            readDictionary(dictionaryDataStream, dictionarySize, dictionaryLength, dictionary);
        }
        dictionaryOpen = true;

        // read row group dictionary
        RowGroupDictionaryLengthStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream();
        if (dictionaryLengthStream != null) {
            int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount();

            // resize the dictionary array if necessary
            if (rowGroupDictionary.length < rowGroupDictionarySize) {
                rowGroupDictionary = new Slice[rowGroupDictionarySize];
                rowGroupDictionaryLength = new int[rowGroupDictionarySize];
            }

            // read the lengths
            dictionaryLengthStream.nextIntVector(rowGroupDictionarySize, rowGroupDictionaryLength);

            ByteArrayStream dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream();
            readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, rowGroupDictionary);
        }
        // (the original redundantly set dictionaryOpen = true a second time here)

        presentStream = presentStreamSource.openStream();
        inDictionaryStream = inDictionaryStreamSource.openStream();
        dataStream = dataStreamSource.openStream();

        rowGroupOpen = true;
    }

    /**
     * Slices {@code dictionaryDataStream} into {@code dictionarySize} entries using
     * the per-entry byte lengths in {@code dictionaryLength}. Zero-length entries
     * become {@link Slices#EMPTY_SLICE} without touching the (possibly null) stream.
     */
    private static void readDictionary(@Nullable ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, Slice[] dictionary)
            throws IOException
    {
        // build dictionary slices
        for (int i = 0; i < dictionarySize; i++) {
            int length = dictionaryLength[i];
            if (length == 0) {
                dictionary[i] = Slices.EMPTY_SLICE;
            }
            else {
                dictionary[i] = Slices.wrappedBuffer(dictionaryDataStream.next(length));
            }
        }
    }

    @Override
    public void startStripe(StreamSources dictionaryStreamSources, List<ColumnEncoding> encoding)
            throws IOException
    {
        dictionaryDataStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, ByteArrayStream.class);
        dictionaryLengthStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class);
        dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize();
        dictionaryOpen = false;

        presentStreamSource = missingStreamSource(BooleanStream.class);
        dataStreamSource = missingStreamSource(LongStream.class);

        inDictionaryStreamSource = missingStreamSource(BooleanStream.class);
        rowGroupDictionaryLengthStreamSource = missingStreamSource(RowGroupDictionaryLengthStream.class);
        rowGroupDictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        presentStream = null;
        inDictionaryStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public void startRowGroup(StreamSources dataStreamSources)
            throws IOException
    {
        presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class);
        dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class);

        // the "in dictionary" stream signals if the value is in the stripe or row group dictionary
        inDictionaryStreamSource = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class);
        rowGroupDictionaryLengthStreamSource = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY_LENGTH, RowGroupDictionaryLengthStream.class);
        rowGroupDictionaryDataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY, ByteArrayStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        presentStream = null;
        inDictionaryStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public String toString()
    {
        return toStringHelper(this)
                .addValue(streamDescriptor)
                .toString();
    }
}
/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tajo.storage.thirdparty.orc.reader;

import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException;
import org.apache.tajo.storage.thirdparty.orc.SliceVector;
import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor;
import org.apache.tajo.storage.thirdparty.orc.Vector;
import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding;
import org.apache.tajo.storage.thirdparty.orc.stream.*;
import com.google.common.primitives.Ints;
import io.airlift.slice.Slices;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*;
import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector;
import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource;
import static com.google.common.base.MoreObjects.toStringHelper;
import static com.google.common.base.Preconditions.checkNotNull;

/**
 * Reads a direct-encoded (non-dictionary) string/binary column into a
 * {@link SliceVector}: a LENGTH stream gives per-row byte counts and a DATA
 * stream holds the concatenated bytes. Each batch is read as one contiguous
 * byte array and sliced per row without copying. Not thread-safe.
 */
public class SliceDirectStreamReader
        implements StreamReader
{
    private static final byte[] EMPTY_BYTE_ARRAY = new byte[0];

    private final StreamDescriptor streamDescriptor;

    // Rows skipped since the last readBatch (applied lazily) and the size of
    // the batch requested by prepareNextRead.
    private int readOffset;
    private int nextBatchSize;

    @Nonnull
    private StreamSource<BooleanStream> presentStreamSource = missingStreamSource(BooleanStream.class);
    @Nullable
    private BooleanStream presentStream;
    private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH];

    @Nonnull
    private StreamSource<LongStream> lengthStreamSource = missingStreamSource(LongStream.class);
    @Nullable
    private LongStream lengthStream;
    private final int[] lengthVector = new int[Vector.MAX_VECTOR_LENGTH];

    @Nonnull
    private StreamSource<ByteArrayStream> dataByteSource = missingStreamSource(ByteArrayStream.class);
    @Nullable
    private ByteArrayStream dataStream;

    private boolean rowGroupOpen;

    public SliceDirectStreamReader(StreamDescriptor streamDescriptor)
    {
        this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null");
    }

    @Override
    public void prepareNextRead(int batchSize)
    {
        // Defer the skip of unread rows until the next readBatch.
        readOffset += nextBatchSize;
        nextBatchSize = batchSize;
    }

    /**
     * Reads the next batch into {@code vector}, which must be a {@link SliceVector}.
     *
     * @throws OrcCorruptionException if a required length or data stream is missing
     */
    @Override
    public void readBatch(Object vector)
            throws IOException
    {
        if (!rowGroupOpen) {
            openRowGroup();
        }

        if (readOffset > 0) {
            if (presentStream != null) {
                // skip ahead the present bit reader, but count the set bits
                // and use this as the skip size for the length reader
                readOffset = presentStream.countBitsSet(readOffset);
            }
            if (readOffset > 0) {
                if (lengthStream == null) {
                    throw new OrcCorruptionException("Value is not null but length stream is not present");
                }
                // Skipping rows means skipping the sum of their byte lengths in the data stream.
                long dataSkipSize = lengthStream.sum(readOffset);
                if (dataSkipSize > 0) {
                    if (dataStream == null) {
                        throw new OrcCorruptionException("Value is not null but data stream is not present");
                    }
                    dataStream.skip(Ints.checkedCast(dataSkipSize));
                }
            }
        }

        SliceVector sliceVector = castOrcVector(vector, SliceVector.class);
        if (presentStream == null) {
            // No present stream: every value is non-null.
            if (lengthStream == null) {
                throw new OrcCorruptionException("Value is not null but length stream is not present");
            }
            lengthStream.nextIntVector(nextBatchSize, lengthVector);
        }
        else {
            int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector);
            if (nullValues != nextBatchSize) {
                if (lengthStream == null) {
                    throw new OrcCorruptionException("Value is not null but length stream is not present");
                }
                lengthStream.nextIntVector(nextBatchSize, lengthVector, isNullVector);
            }
        }

        int totalLength = 0;
        for (int i = 0; i < nextBatchSize; i++) {
            if (!isNullVector[i]) {
                totalLength += lengthVector[i];
            }
        }

        byte[] data = EMPTY_BYTE_ARRAY;
        if (totalLength > 0) {
            if (dataStream == null) {
                throw new OrcCorruptionException("Value is not null but data stream is not present");
            }
            data = dataStream.next(totalLength);
        }

        // Carve the batch's byte array into one zero-copy slice per non-null row.
        int offset = 0;
        for (int i = 0; i < nextBatchSize; i++) {
            if (!isNullVector[i]) {
                int length = lengthVector[i];
                sliceVector.vector[i] = Slices.wrappedBuffer(data, offset, length);
                offset += length;
            }
            else {
                sliceVector.vector[i] = null;
            }
        }

        readOffset = 0;
        nextBatchSize = 0;
    }

    // Opens the per-row-group streams from the current stream sources.
    private void openRowGroup()
            throws IOException
    {
        presentStream = presentStreamSource.openStream();
        lengthStream = lengthStreamSource.openStream();
        dataStream = dataByteSource.openStream();

        rowGroupOpen = true;
    }

    @Override
    public void startStripe(StreamSources dictionaryStreamSources, List<ColumnEncoding> encoding)
            throws IOException
    {
        // Direct encoding has no dictionary; just reset all per-stripe state.
        presentStreamSource = missingStreamSource(BooleanStream.class);
        lengthStreamSource = missingStreamSource(LongStream.class);
        dataByteSource = missingStreamSource(ByteArrayStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        Arrays.fill(isNullVector, false);

        presentStream = null;
        lengthStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public void startRowGroup(StreamSources dataStreamSources)
            throws IOException
    {
        presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class);
        lengthStreamSource = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class);
        dataByteSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteArrayStream.class);

        readOffset = 0;
        nextBatchSize = 0;

        Arrays.fill(isNullVector, false);

        presentStream = null;
        lengthStream = null;
        dataStream = null;

        rowGroupOpen = false;
    }

    @Override
    public String toString()
    {
        return toStringHelper(this)
                .addValue(streamDescriptor)
                .toString();
    }
}
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class SliceStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + private final SliceDirectStreamReader directReader; + private final SliceDictionaryStreamReader dictionaryReader; + private StreamReader currentReader; + + public SliceStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + directReader = new SliceDirectStreamReader(streamDescriptor); + dictionaryReader = new SliceDictionaryStreamReader(streamDescriptor); + } + + @Override + public void readBatch(Object vector) + throws IOException + { + currentReader.readBatch(vector); + } + + @Override + public void prepareNextRead(int batchSize) + { + currentReader.prepareNextRead(batchSize); + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + ColumnEncodingKind columnEncodingKind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); + if (columnEncodingKind == DIRECT || columnEncodingKind == DIRECT_V2 || columnEncodingKind == DWRF_DIRECT) { + currentReader = directReader; + } + else if (columnEncodingKind == DICTIONARY || columnEncodingKind == DICTIONARY_V2) { + currentReader = dictionaryReader; + } + else { + throw new IllegalArgumentException("Unsupported encoding 
" + columnEncodingKind); + } + + currentReader.startStripe(dictionaryStreamSources, encoding); + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + currentReader.startRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReader.java new file mode 100644 index 0000000000..e7ea384be5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReader.java @@ -0,0 +1,34 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +public interface StreamReader +{ + void readBatch(Object vector) + throws IOException; + + void prepareNextRead(int batchSize); + + void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException; + + void startRowGroup(StreamSources dataStreamSources) + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java new file mode 100644 index 0000000000..07913335ec --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java @@ -0,0 +1,230 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.LongVector; +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class TimestampStreamReader + implements StreamReader +{ + private static final int MILLIS_PER_SECOND = 1000; + + private final StreamDescriptor streamDescriptor; + private final long baseTimestampInSeconds; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource secondsStreamSource = missingStreamSource(LongStream.class); + @Nullable + private LongStream secondsStream; + + @Nonnull + private StreamSource nanosStreamSource = missingStreamSource(LongStream.class); + @Nullable + private LongStream nanosStream; + 
+ private final long[] nanosVector = new long[Vector.MAX_VECTOR_LENGTH]; + + private boolean rowGroupOpen; + + public TimestampStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / MILLIS_PER_SECOND; + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + if (secondsStream == null) { + throw new OrcCorruptionException("Value is not null but seconds stream is not present"); + } + if (nanosStream == null) { + throw new OrcCorruptionException("Value is not null but nanos stream is not present"); + } + + secondsStream.skip(readOffset); + nanosStream.skip(readOffset); + } + } + + LongVector longVector = castOrcVector(vector, LongVector.class); + if (presentStream == null) { + if (secondsStream == null) { + throw new OrcCorruptionException("Value is not null but seconds stream is not present"); + } + if (nanosStream == null) { + throw new OrcCorruptionException("Value is not null but nanos stream is not present"); + } + + Arrays.fill(longVector.isNull, false); + secondsStream.nextLongVector(nextBatchSize, longVector.vector); + nanosStream.nextLongVector(nextBatchSize, nanosVector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); + if (nullValues != nextBatchSize) { + if (secondsStream == null) { + throw new 
OrcCorruptionException("Value is not null but seconds stream is not present"); + } + if (nanosStream == null) { + throw new OrcCorruptionException("Value is not null but nanos stream is not present"); + } + + secondsStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); + nanosStream.nextLongVector(nextBatchSize, nanosVector, longVector.isNull); + } + } + + // merge seconds and nanos together + for (int i = 0; i < nextBatchSize; i++) { + longVector.vector[i] = decodeTimestamp(longVector.vector[i], nanosVector[i], baseTimestampInSeconds); + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + secondsStream = secondsStreamSource.openStream(); + nanosStream = nanosStreamSource.openStream(); + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + secondsStreamSource = missingStreamSource(LongStream.class); + nanosStreamSource = missingStreamSource(LongStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + secondsStream = null; + nanosStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + secondsStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); + nanosStreamSource = dataStreamSources.getStreamSource(streamDescriptor, SECONDARY, LongStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + secondsStream = null; + nanosStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } + + // This comes 
from the Apache Hive ORC code + public static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) + { + long millis = (seconds + baseTimestampInSeconds) * MILLIS_PER_SECOND; + long nanos = parseNanos(serializedNanos); + + // the rounding error exists because java always rounds up when dividing integers + // -42001/1000 = -42; and -42001 % 1000 = -1 (+ 1000) + // to get the correct value we need + // (-42 - 1)*1000 + 999 = -42001 + // (42)*1000 + 1 = 42001 + if (millis < 0 && nanos != 0) { + millis -= 1000; + } + // Truncate nanos to millis and add to mills + return millis + (nanos / 1000000); + } + + // This comes from the Apache Hive ORC code + private static int parseNanos(long serialized) + { + int zeros = ((int) serialized) & 0x7; // 0b111 + int result = (int) (serialized >>> 3); + if (zeros != 0) { + for (int i = 0; i <= zeros; ++i) { + result *= 10; + } + } + return result; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/BooleanStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/BooleanStream.java new file mode 100644 index 0000000000..6d7322e45a --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/BooleanStream.java @@ -0,0 +1,211 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.BooleanStreamCheckpoint; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkState; + +public class BooleanStream + implements ValueStream +{ + private static final int HIGH_BIT_MASK = 0x80; // was 0b1000_0000 for JDK 7 + private final ByteStream byteStream; + private byte data; + private int bitsInData; + + public BooleanStream(OrcInputStream byteStream) + { + this.byteStream = new ByteStream(byteStream); + } + + private void readByte() + throws IOException + { + checkState(bitsInData == 0); + data = byteStream.next(); + bitsInData = 8; + } + + public boolean nextBit() + throws IOException + { + // read more data if necessary + if (bitsInData == 0) { + readByte(); + } + + // read bit + boolean result = (data & HIGH_BIT_MASK) != 0; + + // mark bit consumed + data <<= 1; + bitsInData--; + + return result; + } + + @Override + public Class getCheckpointType() + { + return BooleanStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(BooleanStreamCheckpoint checkpoint) + throws IOException + { + byteStream.seekToCheckpoint(checkpoint.getByteStreamCheckpoint()); + bitsInData = 0; + skip(checkpoint.getOffset()); + } + + @Override + public void skip(int items) + throws IOException + { + if (bitsInData >= items) { + data <<= items; + bitsInData -= items; + } + else { + items -= bitsInData; + bitsInData = 0; + + byteStream.skip(items >>> 3); + items = items & 0x07; // 0b111; + + if (items != 0) { + readByte(); + data <<= items; + bitsInData -= items; + } + } + } + + public int countBitsSet(int items) + throws IOException + { + int count = 0; + + // count buffered data + if (items > bitsInData && bitsInData > 0) { + count += bitCount(data); + items -= bitsInData; + bitsInData = 0; + } + + // count whole bytes + while (items > 8) { + count += bitCount(byteStream.next()); + items -= 8; + } + + // count 
remaining bits + for (int i = 0; i < items; i++) { + // read more data if necessary + if (bitsInData == 0) { + readByte(); + } + + // read bit + if ((data & HIGH_BIT_MASK) != 0) { + count++; + } + + // mark bit consumed + data <<= 1; + bitsInData--; + } + + return count; + } + + /** + * Sets the vector element to true if the bit is set. + */ + public void getSetBits(int batchSize, boolean[] vector) + throws IOException + { + for (int i = 0; i < batchSize; i++) { + // read more data if necessary + if (bitsInData == 0) { + readByte(); + } + + // read bit + vector[i] = (data & HIGH_BIT_MASK) != 0; + + // mark bit consumed + data <<= 1; + bitsInData--; + } + } + + /** + * Sets the vector element to true if the bit is set, skipping the null values. + */ + public void getSetBits(int batchSize, boolean[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < batchSize; i++) { + if (!isNull[i]) { + // read more data if necessary + if (bitsInData == 0) { + readByte(); + } + + // read bit + vector[i] = (data & HIGH_BIT_MASK) != 0; + + // mark bit consumed + data <<= 1; + bitsInData--; + } + } + } + + /** + * Sets the vector element to true if the bit is not set. 
+ */ + public int getUnsetBits(int batchSize, boolean[] vector) + throws IOException + { + int count = 0; + for (int i = 0; i < batchSize; i++) { + // read more data if necessary + if (bitsInData == 0) { + readByte(); + } + + // read bit + vector[i] = (data & HIGH_BIT_MASK) == 0; + if (vector[i]) { + count++; + } + + // mark bit consumed + data <<= 1; + bitsInData--; + } + return count; + } + + private static int bitCount(byte data) + { + return Integer.bitCount(data & 0xFF); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java new file mode 100644 index 0000000000..321e0ff4e4 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java @@ -0,0 +1,67 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteArrayStreamCheckpoint; + +import java.io.IOException; + +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; +import static com.google.common.base.Preconditions.checkNotNull; + +public class ByteArrayStream + implements ValueStream +{ + private final OrcInputStream inputStream; + + public ByteArrayStream(OrcInputStream inputStream) + { + this.inputStream = checkNotNull(inputStream, "inputStream is null"); + } + + public byte[] next(int length) + throws IOException + { + byte[] data = new byte[length]; + readFully(inputStream, data, 0, length); + return data; + } + + public void next(int length, byte[] data) + throws IOException + { + readFully(inputStream, data, 0, length); + } + + @Override + public Class getCheckpointType() + { + return ByteArrayStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(ByteArrayStreamCheckpoint checkpoint) + throws IOException + { + inputStream.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); + } + + @Override + public void skip(int skipSize) + throws IOException + { + skipFully(inputStream, skipSize); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java new file mode 100644 index 0000000000..8469283fb5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java @@ -0,0 +1,138 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteStreamCheckpoint; + +import java.io.IOException; +import java.util.Arrays; + +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; + +public class ByteStream + implements ValueStream +{ + private final OrcInputStream input; + private final byte[] buffer = new byte[MIN_REPEAT_SIZE + 127]; + private int length; + private int offset; + private long lastReadInputCheckpoint; + + public ByteStream(OrcInputStream input) + { + this.input = input; + lastReadInputCheckpoint = input.getCheckpoint(); + } + + // This is based on the Apache Hive ORC code + private void readNextBlock() + throws IOException + { + lastReadInputCheckpoint = input.getCheckpoint(); + + int control = input.read(); + if (control == -1) { + throw new OrcCorruptionException("Read past end of buffer RLE byte from %s", input); + } + + offset = 0; + + // if byte high bit is not set, this is a repetition; otherwise it is a literal sequence + if ((control & 0x80) == 0) { + length = control + MIN_REPEAT_SIZE; + + // read the repeated value + int value = input.read(); + if (value == -1) { + throw new OrcCorruptionException("Reading RLE byte got EOF"); + } + + // fill buffer with the value + Arrays.fill(buffer, 0, length, (byte) value); + } + else { + // length is 2's complement 
of byte + length = 0x100 - control; + + // read the literals into the buffer + readFully(input, buffer, 0, length); + } + } + + @Override + public Class getCheckpointType() + { + return ByteStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(ByteStreamCheckpoint checkpoint) + throws IOException + { + // if the checkpoint is within the current buffer, just adjust the pointer + if (lastReadInputCheckpoint == checkpoint.getInputStreamCheckpoint() && checkpoint.getOffset() <= length) { + offset = checkpoint.getOffset(); + } + else { + // otherwise, discard the buffer and start over + input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); + length = 0; + offset = 0; + skip(checkpoint.getOffset()); + } + } + + @Override + public void skip(int items) + throws IOException + { + while (items > 0) { + if (offset == length) { + readNextBlock(); + } + long consume = Math.min(items, length - offset); + offset += consume; + items -= consume; + } + } + + public byte next() + throws IOException + { + if (offset == length) { + readNextBlock(); + } + return buffer[offset++]; + } + + public void nextVector(long items, long[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = next(); + } + } + + public void nextVector(long items, long[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = next(); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java new file mode 100644 index 0000000000..89ee357507 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this 
file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; +import com.google.common.base.MoreObjects; + +import javax.annotation.Nullable; +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkNotNull; + +public class CheckpointStreamSource, C extends StreamCheckpoint> + implements StreamSource +{ + public static , C extends StreamCheckpoint> CheckpointStreamSource createCheckpointStreamSource(S stream, StreamCheckpoint checkpoint) + { + checkNotNull(stream, "stream is null"); + checkNotNull(checkpoint, "checkpoint is null"); + + Class checkpointType = stream.getCheckpointType(); + C verifiedCheckpoint = OrcStreamUtils.checkType(checkpoint, checkpointType, "Checkpoint"); + return new CheckpointStreamSource(stream, verifiedCheckpoint); + } + + private final S stream; + private final C checkpoint; + + public CheckpointStreamSource(S stream, C checkpoint) + { + this.stream = checkNotNull(stream, "stream is null"); + this.checkpoint = checkNotNull(checkpoint, "checkpoint is null"); + } + + @Override + public Class getStreamType() + { + return (Class) stream.getClass(); + } + + @Nullable + @Override + public S openStream() + throws IOException + { + stream.seekToCheckpoint(checkpoint); + return stream; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("stream", stream) + .add("checkpoint", checkpoint) + .toString(); + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java new file mode 100644 index 0000000000..1344bc66c0 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java @@ -0,0 +1,104 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.DoubleStreamCheckpoint; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; + +import java.io.IOException; + +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; +import static com.google.common.base.Preconditions.checkPositionIndex; +import static io.airlift.slice.SizeOf.SIZE_OF_DOUBLE; + +public class DoubleStream + implements ValueStream +{ + private final OrcInputStream input; + private final byte[] buffer = new byte[Vector.MAX_VECTOR_LENGTH * SIZE_OF_DOUBLE]; + private final Slice slice = Slices.wrappedBuffer(buffer); + + public DoubleStream(OrcInputStream input) + { + this.input = input; + } + + @Override + public Class getCheckpointType() + { + return DoubleStreamCheckpoint.class; + } + + 
@Override + public void seekToCheckpoint(DoubleStreamCheckpoint checkpoint) + throws IOException + { + input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); + } + + @Override + public void skip(int items) + throws IOException + { + long length = items * SIZE_OF_DOUBLE; + skipFully(input, length); + } + + public double next() + throws IOException + { + readFully(input, buffer, 0, SIZE_OF_DOUBLE); + return slice.getDouble(0); + } + + public void nextVector(int items, double[] vector) + throws IOException + { + checkPositionIndex(items, vector.length); + checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); + + // buffer that number of values + readFully(input, buffer, 0, items * SIZE_OF_DOUBLE); + + // copy values directly into vector + Slices.wrappedDoubleArray(vector).setBytes(0, slice, 0, items * SIZE_OF_DOUBLE); + } + + public void nextVector(long items, double[] vector, boolean[] isNull) + throws IOException + { + // count the number of non nulls + int notNullCount = 0; + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + notNullCount++; + } + } + + // buffer that number of values + readFully(input, buffer, 0, notNullCount * SIZE_OF_DOUBLE); + + // load them into the buffer + int elementIndex = 0; + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = slice.getDouble(elementIndex); + elementIndex += SIZE_OF_DOUBLE; + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java new file mode 100644 index 0000000000..b60bd46103 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java @@ -0,0 +1,109 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.FloatStreamCheckpoint; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; + +import java.io.IOException; + +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; +import static com.google.common.base.Preconditions.checkPositionIndex; +import static io.airlift.slice.SizeOf.SIZE_OF_FLOAT; + +public class FloatStream + implements ValueStream +{ + private final OrcInputStream input; + private final byte[] buffer = new byte[Vector.MAX_VECTOR_LENGTH * SIZE_OF_FLOAT]; + private final Slice slice = Slices.wrappedBuffer(buffer); + + public FloatStream(OrcInputStream input) + { + this.input = input; + } + + @Override + public Class getCheckpointType() + { + return FloatStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(FloatStreamCheckpoint checkpoint) + throws IOException + { + input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); + } + + @Override + public void skip(int items) + throws IOException + { + long length = items * SIZE_OF_FLOAT; + skipFully(input, length); + } + + public float next() + throws IOException + { + readFully(input, buffer, 0, SIZE_OF_FLOAT); + return slice.getFloat(0); + } + + public void nextVector(int items, double[] vector) + throws IOException + { + checkPositionIndex(items, vector.length); + 
checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); + + // buffer that number of values + readFully(input, buffer, 0, items * SIZE_OF_FLOAT); + + // load them into the buffer one at a time since we are reading + // floats into a double vector + int elementIndex = 0; + for (int i = 0; i < items; i++) { + vector[i] = slice.getFloat(elementIndex); + elementIndex += SIZE_OF_FLOAT; + } + } + + public void nextVector(long items, double[] vector, boolean[] isNull) + throws IOException + { + // count the number of non nulls + int notNullCount = 0; + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + notNullCount++; + } + } + + // buffer that number of values + readFully(input, buffer, 0, notNullCount * SIZE_OF_FLOAT); + + // load them into the buffer + int elementIndex = 0; + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = slice.getFloat(elementIndex); + elementIndex += SIZE_OF_FLOAT; + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java new file mode 100644 index 0000000000..6ad58b8776 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java @@ -0,0 +1,178 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; + +import java.io.IOException; +import java.io.InputStream; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; + +// This is based on the Apache Hive ORC code +public final class LongDecode +{ + private LongDecode() + { + } + + enum FixedBitSizes + { + ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, + THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, + TWENTY, TWENTY_ONE, TWENTY_TWO, TWENTY_THREE, TWENTY_FOUR, TWENTY_SIX, + TWENTY_EIGHT, THIRTY, THIRTY_TWO, FORTY, FORTY_EIGHT, FIFTY_SIX, SIXTY_FOUR; + } + + /** + * Decodes the ordinal fixed bit value to actual fixed bit width value. + */ + public static int decodeBitWidth(int n) + { + if (n >= ONE.ordinal() && n <= TWENTY_FOUR.ordinal()) { + return n + 1; + } + else if (n == TWENTY_SIX.ordinal()) { + return 26; + } + else if (n == TWENTY_EIGHT.ordinal()) { + return 28; + } + else if (n == THIRTY.ordinal()) { + return 30; + } + else if (n == THIRTY_TWO.ordinal()) { + return 32; + } + else if (n == FORTY.ordinal()) { + return 40; + } + else if (n == FORTY_EIGHT.ordinal()) { + return 48; + } + else if (n == FIFTY_SIX.ordinal()) { + return 56; + } + else { + return 64; + } + } + + /** + * Gets the closest supported fixed bit width for the specified bit width. 
+ */ + public static int getClosestFixedBits(int width) + { + if (width == 0) { + return 1; + } + + if (width >= 1 && width <= 24) { + return width; + } + else if (width > 24 && width <= 26) { + return 26; + } + else if (width > 26 && width <= 28) { + return 28; + } + else if (width > 28 && width <= 30) { + return 30; + } + else if (width > 30 && width <= 32) { + return 32; + } + else if (width > 32 && width <= 40) { + return 40; + } + else if (width > 40 && width <= 48) { + return 48; + } + else if (width > 48 && width <= 56) { + return 56; + } + else { + return 64; + } + } + + public static long readSignedVInt(InputStream inputStream) + throws IOException + { + long result = readUnsignedVInt(inputStream); + return (result >>> 1) ^ -(result & 1); + } + + public static long readUnsignedVInt(InputStream inputStream) + throws IOException + { + long result = 0; + int offset = 0; + long b; + do { + b = inputStream.read(); + if (b == -1) { + throw new OrcCorruptionException("EOF while reading unsigned vint"); + } + result |= (b & 0x7F /* 0b0111_1111 */) << offset; + offset += 7; + } while ((b & 0x80 /* 0b1000_0000 */) != 0); + return result; + } + + public static long readVInt(boolean signed, InputStream inputStream) + throws IOException + { + if (signed) { + return readSignedVInt(inputStream); + } + else { + return readUnsignedVInt(inputStream); + } + } + + public static long zigzagDecode(long value) + { + return (value >>> 1) ^ -(value & 1); + } + + public static long readDwrfLong(InputStream input, OrcTypeKind type, boolean signed, boolean usesVInt) + throws IOException + { + if (usesVInt) { + return readVInt(signed, input); + } + else if (type == SHORT) { + return input.read() | (input.read() << 8); + } + else if (type == INT) { + return input.read() | (input.read() << 8) | (input.read() << 16) | (input.read() << 24); + } + else if (type == LONG) { + return ((long) input.read()) | + (((long) input.read()) << 8) | + (((long) input.read()) << 16) | + (((long) 
input.read()) << 24) | + (((long) input.read()) << 32) | + (((long) input.read()) << 40) | + (((long) input.read()) << 48) | + (((long) input.read()) << 56); + } + else { + throw new IllegalArgumentException(type + " type is not supported"); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStream.java new file mode 100644 index 0000000000..a6042dc978 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStream.java @@ -0,0 +1,40 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; + +import java.io.IOException; + +public interface LongStream + extends ValueStream +{ + long next() + throws IOException; + + void nextIntVector(int items, int[] vector) + throws IOException; + + void nextIntVector(int items, int[] vector, boolean[] isNull) + throws IOException; + + void nextLongVector(int items, long[] vector) + throws IOException; + + void nextLongVector(int items, long[] vector, boolean[] isNull) + throws IOException; + + long sum(int items) + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java new file mode 100644 index 0000000000..16c9180fd4 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java @@ -0,0 +1,129 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamDwrfCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import com.google.common.primitives.Ints; + +import java.io.IOException; + +import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.readDwrfLong; +import static com.google.common.base.Preconditions.checkPositionIndex; + +public class LongStreamDwrf + implements LongStream +{ + private final OrcInputStream input; + private final OrcTypeKind orcTypeKind; + private final boolean signed; + private final boolean usesVInt; + + public LongStreamDwrf(OrcInputStream input, OrcTypeKind type, boolean signed, boolean usesVInt) + { + this.input = input; + this.orcTypeKind = type; + this.signed = signed; + this.usesVInt = usesVInt; + } + + @Override + public Class getCheckpointType() + { + return LongStreamDwrfCheckpoint.class; + } + + @Override + public void seekToCheckpoint(LongStreamCheckpoint checkpoint) + throws IOException + { + LongStreamDwrfCheckpoint dwrfCheckpoint = OrcStreamUtils.checkType(checkpoint, LongStreamDwrfCheckpoint.class, "Checkpoint"); + input.seekToCheckpoint(dwrfCheckpoint.getInputStreamCheckpoint()); + } + + @Override + public void skip(int items) + throws IOException + { + // there is no fast way to skip values + for (int i = 0; i < items; i++) { + next(); + } + } + + @Override + public long sum(int items) + throws IOException + { + long sum = 0; + for (int i = 0; i < items; i++) { + sum += next(); + } + return sum; + } + + @Override + public long next() + throws IOException + { + return readDwrfLong(input, orcTypeKind, signed, usesVInt); + } + + @Override + public void nextIntVector(int items, int[] vector) + throws IOException + { + checkPositionIndex(items, vector.length); + 
checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); + + for (int i = 0; i < items; i++) { + vector[i] = Ints.checkedCast(next()); + } + } + + @Override + public void nextIntVector(int items, int[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = Ints.checkedCast(next()); + } + } + } + + @Override + public void nextLongVector(int items, long[] vector) + throws IOException + { + checkPositionIndex(items, vector.length); + + for (int i = 0; i < items; i++) { + vector[i] = next(); + } + } + + @Override + public void nextLongVector(int items, long[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = next(); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java new file mode 100644 index 0000000000..ee12910031 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java @@ -0,0 +1,188 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV1Checkpoint; +import com.google.common.primitives.Ints; + +import java.io.IOException; + +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; + +public class LongStreamV1 + implements LongStream +{ + private static final int MAX_LITERAL_SIZE = 128; + + private final OrcInputStream input; + private final boolean signed; + private final long[] literals = new long[MAX_LITERAL_SIZE]; + private int numLiterals; + private int delta; + private int used; + private boolean repeat; + private long lastReadInputCheckpoint; + + public LongStreamV1(OrcInputStream input, boolean signed) + { + this.input = input; + this.signed = signed; + lastReadInputCheckpoint = input.getCheckpoint(); + } + + // This comes from the Apache Hive ORC code + private void readValues() + throws IOException + { + lastReadInputCheckpoint = input.getCheckpoint(); + + int control = input.read(); + if (control == -1) { + throw new OrcCorruptionException("Read past end of RLE integer from %s", input); + } + + if (control < 0x80) { + numLiterals = control + MIN_REPEAT_SIZE; + used = 0; + repeat = true; + delta = input.read(); + if (delta == -1) { + throw new OrcCorruptionException("End of stream in RLE Integer from %s", input); + } + + // convert from 0 to 255 to -128 to 127 by converting to a signed byte + // noinspection SillyAssignment + delta = (byte) delta; + literals[0] = LongDecode.readVInt(signed, input); + } + else { + numLiterals = 0x100 - control; + used = 0; + repeat = false; + for (int i = 0; i < numLiterals; ++i) { + literals[i] = LongDecode.readVInt(signed, input); + } + } + } + + @Override + // This comes from the Apache Hive ORC code + public long next() + throws IOException 
+ { + long result; + if (used == numLiterals) { + readValues(); + } + if (repeat) { + result = literals[0] + (used++) * delta; + } + else { + result = literals[used++]; + } + return result; + } + + @Override + public Class getCheckpointType() + { + return LongStreamV1Checkpoint.class; + } + + @Override + public void seekToCheckpoint(LongStreamCheckpoint checkpoint) + throws IOException + { + LongStreamV1Checkpoint v1Checkpoint = OrcStreamUtils.checkType(checkpoint, LongStreamV1Checkpoint.class, "Checkpoint"); + + // if the checkpoint is within the current buffer, just adjust the pointer + if (lastReadInputCheckpoint == v1Checkpoint.getInputStreamCheckpoint() && v1Checkpoint.getOffset() <= numLiterals) { + used = v1Checkpoint.getOffset(); + } + else { + // otherwise, discard the buffer and start over + input.seekToCheckpoint(v1Checkpoint.getInputStreamCheckpoint()); + numLiterals = 0; + used = 0; + skip(v1Checkpoint.getOffset()); + } + } + + @Override + public void skip(int items) + throws IOException + { + while (items > 0) { + if (used == numLiterals) { + readValues(); + } + long consume = Math.min(items, numLiterals - used); + used += consume; + items -= consume; + } + } + + @Override + public long sum(int items) + throws IOException + { + long sum = 0; + for (int i = 0; i < items; i++) { + sum += next(); + } + return sum; + } + + @Override + public void nextLongVector(int items, long[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = next(); + } + } + + @Override + public void nextLongVector(int items, long[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = next(); + } + } + } + + @Override + public void nextIntVector(int items, int[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = Ints.checkedCast(next()); + } + } + + @Override + public void nextIntVector(int items, int[] vector, boolean[] isNull) + throws IOException + { 
+ for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = Ints.checkedCast(next()); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java new file mode 100644 index 0000000000..87f554787c --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java @@ -0,0 +1,456 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV2Checkpoint; +import com.google.common.primitives.Ints; + +import java.io.IOException; +import java.io.InputStream; + +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; + +/** + * @see {@link org.apache.hadoop.hive.ql.io.orc.RunLengthIntegerWriterV2} for description of various lightweight compression techniques. 
+ */ +// This comes from the Apache Hive ORC code +public class LongStreamV2 + implements LongStream +{ + private static final int MAX_LITERAL_SIZE = 512; + + private enum EncodingType + { + SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA + } + + private final OrcInputStream input; + private final boolean signed; + private final long[] literals = new long[MAX_LITERAL_SIZE]; + private int numLiterals; + private int used; + private final boolean skipCorrupt; + private long lastReadInputCheckpoint; + + public LongStreamV2(OrcInputStream input, boolean signed, boolean skipCorrupt) + { + this.input = input; + this.signed = signed; + this.skipCorrupt = skipCorrupt; + lastReadInputCheckpoint = input.getCheckpoint(); + } + + // This comes from the Apache Hive ORC code + private void readValues() + throws IOException + { + lastReadInputCheckpoint = input.getCheckpoint(); + + // read the first 2 bits and determine the encoding type + int firstByte = input.read(); + if (firstByte < 0) { + throw new OrcCorruptionException("Read past end of RLE integer from %s", input); + } + + int enc = (firstByte >>> 6) & 0x03; + if (EncodingType.SHORT_REPEAT.ordinal() == enc) { + readShortRepeatValues(firstByte); + } + else if (EncodingType.DIRECT.ordinal() == enc) { + readDirectValues(firstByte); + } + else if (EncodingType.PATCHED_BASE.ordinal() == enc) { + readPatchedBaseValues(firstByte); + } + else { + readDeltaValues(firstByte); + } + } + + // This comes from the Apache Hive ORC code + private void readDeltaValues(int firstByte) + throws IOException + { + // extract the number of fixed bits + int fixedBits = (firstByte >>> 1) & 0x1f; + if (fixedBits != 0) { + fixedBits = LongDecode.decodeBitWidth(fixedBits); + } + + // extract the blob run length + int length = (firstByte & 0x01) << 8; + length |= input.read(); + + // read the first value stored as vint + long firstVal = LongDecode.readVInt(signed, input); + + // store first value to result buffer + literals[numLiterals++] = firstVal; + + // 
if fixed bits is 0 then all values have fixed delta + long prevVal; + if (fixedBits == 0) { + // read the fixed delta value stored as vint (deltas can be negative even + // if all number are positive) + long fixedDelta = LongDecode.readSignedVInt(input); + + // add fixed deltas to adjacent values + for (int i = 0; i < length; i++) { + literals[numLiterals++] = literals[numLiterals - 2] + fixedDelta; + } + } + else { + long deltaBase = LongDecode.readSignedVInt(input); + // add delta base and first value + literals[numLiterals++] = firstVal + deltaBase; + prevVal = literals[numLiterals - 1]; + length -= 1; + + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence + readBitPackedLongs(literals, numLiterals, length, fixedBits, input); + while (length > 0) { + if (deltaBase < 0) { + literals[numLiterals] = prevVal - literals[numLiterals]; + } + else { + literals[numLiterals] = prevVal + literals[numLiterals]; + } + prevVal = literals[numLiterals]; + length--; + numLiterals++; + } + } + } + + // This comes from the Apache Hive ORC code + private void readPatchedBaseValues(int firstByte) + throws IOException + { + // extract the number of fixed bits + int fb = LongDecode.decodeBitWidth((firstByte >>> 1) & 0x1F); // 0b1_1111 + + // extract the run length of data blob + int length = (firstByte & 0x01) << 8; + length |= input.read(); + // runs are always one off + length += 1; + + // extract the number of bytes occupied by base + int thirdByte = input.read(); + int baseWidth = (thirdByte >>> 5) & 0x07; // 0b0111 + // base width is one off + baseWidth += 1; + + // extract patch width + int patchWidth = LongDecode.decodeBitWidth(thirdByte & 0x1F); // 0b1_1111 + + // read fourth byte and extract patch gap width + int fourthByte = input.read(); + int patchGapWidth = (fourthByte >>> 5) & 0x07; // 0b0111 + // patch gap width is one 
off + patchGapWidth += 1; + + // extract the length of the patch list + int patchListLength = fourthByte & 0x1F; // 0b1_1111 + + // read the next base width number of bytes to extract base value + long base = bytesToLongBE(input, baseWidth); + long mask = (1L << ((baseWidth * 8) - 1)); + // if MSB of base value is 1 then base is negative value else positive + if ((base & mask) != 0) { + base = base & ~mask; + base = -base; + } + + // unpack the data blob + long[] unpacked = new long[length]; + readBitPackedLongs(unpacked, 0, length, fb, input); + + // unpack the patch blob + long[] unpackedPatch = new long[patchListLength]; + + if ((patchWidth + patchGapWidth) > 64 && !skipCorrupt) { + throw new OrcCorruptionException("ORC file is corrupt"); + } + + int bitSize = LongDecode.getClosestFixedBits(patchWidth + patchGapWidth); + readBitPackedLongs(unpackedPatch, 0, patchListLength, bitSize, input); + + // apply the patch directly when decoding the packed data + int patchIndex = 0; + long currentGap; + long currentPatch; + long patchMask = ((1L << patchWidth) - 1); + currentGap = unpackedPatch[patchIndex] >>> patchWidth; + currentPatch = unpackedPatch[patchIndex] & patchMask; + long actualGap = 0; + + // special case: gap is >255 then patch value will be 0. 
+ // if gap is <=255 then patch value cannot be 0 + while (currentGap == 255 && currentPatch == 0) { + actualGap += 255; + patchIndex++; + currentGap = unpackedPatch[patchIndex] >>> patchWidth; + currentPatch = unpackedPatch[patchIndex] & patchMask; + } + // add the left over gap + actualGap += currentGap; + + // unpack data blob, patch it (if required), add base to get final result + for (int i = 0; i < unpacked.length; i++) { + if (i == actualGap) { + // extract the patch value + long patchedValue = unpacked[i] | (currentPatch << fb); + + // add base to patched value + literals[numLiterals++] = base + patchedValue; + + // increment the patch to point to next entry in patch list + patchIndex++; + + if (patchIndex < patchListLength) { + // read the next gap and patch + currentGap = unpackedPatch[patchIndex] >>> patchWidth; + currentPatch = unpackedPatch[patchIndex] & patchMask; + actualGap = 0; + + // special case: gap is >255 then patch will be 0. if gap is + // <=255 then patch cannot be 0 + while (currentGap == 255 && currentPatch == 0) { + actualGap += 255; + patchIndex++; + currentGap = unpackedPatch[patchIndex] >>> patchWidth; + currentPatch = unpackedPatch[patchIndex] & patchMask; + } + // add the left over gap + actualGap += currentGap; + + // next gap is relative to the current gap + actualGap += i; + } + } + else { + // no patching required. 
add base to unpacked value to get final value + literals[numLiterals++] = base + unpacked[i]; + } + } + + } + + // This comes from the Apache Hive ORC code + private void readDirectValues(int firstByte) + throws IOException + { + // extract the number of fixed bits + int fixedBits = LongDecode.decodeBitWidth((firstByte >>> 1) & 0x1F); // 0b1_1111 + + // extract the run length + int length = (firstByte & 0x01) << 8; + length |= input.read(); + // runs are one off + length += 1; + + // write the unpacked values and zigzag decode to result buffer + readBitPackedLongs(literals, numLiterals, length, fixedBits, input); + if (signed) { + for (int i = 0; i < length; i++) { + literals[numLiterals] = LongDecode.zigzagDecode(literals[numLiterals]); + numLiterals++; + } + } + else { + numLiterals += length; + } + } + + // This comes from the Apache Hive ORC code + private void readShortRepeatValues(int firstByte) + throws IOException + { + // read the number of bytes occupied by the value + int size = (firstByte >>> 3) & 0x07; // 0b0111 + // #bytes are one off + size += 1; + + // read the run length + int length = firstByte & 0x07; + // run lengths values are stored only after MIN_REPEAT value is met + length += MIN_REPEAT_SIZE; + + // read the repeated value which is store using fixed bytes + long val = bytesToLongBE(input, size); + + if (signed) { + val = LongDecode.zigzagDecode(val); + } + + // repeat the value for length times + for (int i = 0; i < length; i++) { + literals[numLiterals++] = val; + } + } + + // This comes from the Apache Hive ORC code + private static void readBitPackedLongs(long[] buffer, int offset, int len, int bitSize, InputStream input) + throws IOException + { + int bitsLeft = 0; + int current = 0; + + for (int i = offset; i < (offset + len); i++) { + long result = 0; + int bitsLeftToRead = bitSize; + while (bitsLeftToRead > bitsLeft) { + result <<= bitsLeft; + result |= current & ((1 << bitsLeft) - 1); + bitsLeftToRead -= bitsLeft; + current = 
input.read(); + bitsLeft = 8; + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + bitsLeft -= bitsLeftToRead; + result |= (current >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + } + buffer[i] = result; + } + } + + /** + * Read n bytes in big endian order and convert to long. + */ + private static long bytesToLongBE(InputStream input, int n) + throws IOException + { + long out = 0; + long val; + while (n > 0) { + n--; + // store it in a long and then shift else integer overflow will occur + val = input.read(); + out |= (val << (n * 8)); + } + return out; + } + + @Override + public long next() + throws IOException + { + if (used == numLiterals) { + numLiterals = 0; + used = 0; + readValues(); + } + return literals[used++]; + } + + @Override + public Class getCheckpointType() + { + return LongStreamV2Checkpoint.class; + } + + @Override + public void seekToCheckpoint(LongStreamCheckpoint checkpoint) + throws IOException + { + LongStreamV2Checkpoint v2Checkpoint = OrcStreamUtils.checkType(checkpoint, LongStreamV2Checkpoint.class, "Checkpoint"); + + // if the checkpoint is within the current buffer, just adjust the pointer + if (lastReadInputCheckpoint == v2Checkpoint.getInputStreamCheckpoint() && v2Checkpoint.getOffset() <= numLiterals) { + used = v2Checkpoint.getOffset(); + } + else { + // otherwise, discard the buffer and start over + input.seekToCheckpoint(v2Checkpoint.getInputStreamCheckpoint()); + numLiterals = 0; + used = 0; + skip(v2Checkpoint.getOffset()); + } + } + + @Override + public void skip(int items) + throws IOException + { + while (items > 0) { + if (used == numLiterals) { + numLiterals = 0; + used = 0; + readValues(); + } + long consume = Math.min(items, numLiterals - used); + used += consume; + items -= consume; + } + } + + @Override + public long sum(int items) + throws IOException + { + long sum = 0; + for (int i = 0; i < items; i++) { + sum += next(); + } + return sum; + } + + @Override + public void 
nextLongVector(int items, long[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = next(); + } + } + + @Override + public void nextLongVector(int items, long[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = next(); + } + } + } + + @Override + public void nextIntVector(int items, int[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = Ints.checkedCast(next()); + } + } + + @Override + public void nextIntVector(int items, int[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = Ints.checkedCast(next()); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/MissingStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/MissingStreamSource.java new file mode 100644 index 0000000000..3c9adcefb8 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/MissingStreamSource.java @@ -0,0 +1,46 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import javax.annotation.Nullable; +import java.io.IOException; + +public class MissingStreamSource> implements StreamSource +{ + private final Class streamType; + + public static > StreamSource missingStreamSource(Class streamType) + { + return new MissingStreamSource(streamType); + } + + private MissingStreamSource(Class streamType) + { + this.streamType = streamType; + } + + @Override + public Class getStreamType() + { + return streamType; + } + + @Nullable + @Override + public S openStream() + throws IOException + { + return null; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java new file mode 100644 index 0000000000..ec69c1a0c2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java @@ -0,0 +1,295 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import com.google.common.base.MoreObjects; +import com.google.common.primitives.Ints; +import io.airlift.slice.FixedLengthSliceInput; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.iq80.snappy.Snappy; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.*; +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static io.airlift.slice.Slices.EMPTY_SLICE; +import static sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET; + +public final class OrcInputStream + extends InputStream +{ + public static final int EXPECTED_COMPRESSION_RATIO = 5; + private final String source; + private final FixedLengthSliceInput compressedSliceInput; + private final CompressionKind compressionKind; + private final int maxBufferSize; + + private int currentCompressedBlockOffset; + private FixedLengthSliceInput current; + + private byte[] buffer; + + public OrcInputStream(String source, FixedLengthSliceInput sliceInput, CompressionKind compressionKind, int bufferSize) + { + this.source = checkNotNull(source, "source is null"); + + checkNotNull(sliceInput, "sliceInput is null"); + + this.compressionKind = checkNotNull(compressionKind, "compressionKind is null"); + this.maxBufferSize = bufferSize; + + if (compressionKind == UNCOMPRESSED) { + this.current = sliceInput; + this.compressedSliceInput = EMPTY_SLICE.getInput(); + } + else { + checkArgument(compressionKind == SNAPPY || compressionKind == ZLIB, "%s 
compression not supported", compressionKind); + this.compressedSliceInput = checkNotNull(sliceInput, "compressedSliceInput is null"); + this.current = EMPTY_SLICE.getInput(); + } + } + + @Override + public void close() + throws IOException + { + current = null; + } + + @Override + public int available() + throws IOException + { + if (current == null) { + return 0; + } + return current.available(); + } + + @Override + public boolean markSupported() + { + return false; + } + + @Override + public int read() + throws IOException + { + if (current == null) { + return -1; + } + + int result = current.read(); + if (result != -1) { + return result; + } + + advance(); + return read(); + } + + @Override + public int read(byte[] b, int off, int length) + throws IOException + { + if (current == null) { + return -1; + } + + if (current.remaining() == 0) { + advance(); + if (current == null) { + return -1; + } + } + + return current.read(b, off, length); + } + + public long getCheckpoint() + { + // if the decompressed buffer is empty, return a checkpoint starting at the next block + if (current == null || (current.position() == 0 && current.remaining() == 0)) { + return createInputStreamCheckpoint(Ints.checkedCast(compressedSliceInput.position()), 0); + } + // otherwise return a checkpoint at the last compressed block read and the current position in the buffer + return createInputStreamCheckpoint(currentCompressedBlockOffset, Ints.checkedCast(current.position())); + } + + public boolean seekToCheckpoint(long checkpoint) + throws IOException + { + int compressedBlockOffset = decodeCompressedBlockOffset(checkpoint); + int decompressedOffset = decodeDecompressedOffset(checkpoint); + boolean discardedBuffer; + if (compressedBlockOffset != currentCompressedBlockOffset) { + if (compressionKind == UNCOMPRESSED) { + throw new OrcCorruptionException("Reset stream has a compressed block offset but stream is not compressed"); + } + compressedSliceInput.setPosition(compressedBlockOffset); 
+ current = EMPTY_SLICE.getInput(); + discardedBuffer = true; + } + else { + discardedBuffer = false; + } + + if (decompressedOffset != current.position()) { + current.setPosition(0); + if (current.remaining() < decompressedOffset) { + decompressedOffset -= current.remaining(); + advance(); + } + current.setPosition(decompressedOffset); + } + return discardedBuffer; + } + + @Override + public long skip(long n) + throws IOException + { + if (current == null || n <= 0) { + return -1; + } + + long result = current.skip(n); + if (result != 0) { + return result; + } + if (read() == -1) { + return 0; + } + return 1 + current.skip(n - 1); + } + + // This comes from the Apache Hive ORC code + private void advance() + throws IOException + { + if (compressedSliceInput == null || compressedSliceInput.remaining() == 0) { + current = null; + return; + } + + // 3 byte header + // NOTE: this must match BLOCK_HEADER_SIZE + currentCompressedBlockOffset = Ints.checkedCast(compressedSliceInput.position()); + int b0 = compressedSliceInput.readUnsignedByte(); + int b1 = compressedSliceInput.readUnsignedByte(); + int b2 = compressedSliceInput.readUnsignedByte(); + + boolean isUncompressed = (b0 & 0x01) == 1; + int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >>> 1); + + Slice chunk = compressedSliceInput.readSlice(chunkLength); + + if (isUncompressed) { + current = chunk.getInput(); + } + else { + int uncompressedSize; + if (compressionKind == ZLIB) { + uncompressedSize = decompressZip(chunk); + } + else { + uncompressedSize = decompressSnappy(chunk); + } + + current = Slices.wrappedBuffer(buffer, 0, uncompressedSize).getInput(); + } + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("source", source) + .add("compressedOffset", compressedSliceInput.position()) + .add("uncompressedOffset", current == null ? 
null : current.position()) + .add("compression", compressionKind) + .toString(); + } + + // This comes from the Apache Hive ORC code + private int decompressZip(Slice in) + throws IOException + { + Inflater inflater = new Inflater(true); + try { + inflater.setInput((byte[]) in.getBase(), (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET), in.length()); + allocateOrGrowBuffer(in.length() * EXPECTED_COMPRESSION_RATIO, false); + int uncompressedLength = 0; + while (true) { + uncompressedLength += inflater.inflate(buffer, uncompressedLength, buffer.length - uncompressedLength); + if (inflater.finished() || buffer.length >= maxBufferSize) { + break; + } + int oldBufferSize = buffer.length; + allocateOrGrowBuffer(buffer.length * 2, true); + if (buffer.length <= oldBufferSize) { + throw new IllegalStateException(String.format("Buffer failed to grow. Old size %d, current size %d", oldBufferSize, buffer.length)); + } + } + + if (!inflater.finished()) { + throw new OrcCorruptionException("Could not decompress all input (output buffer too small?)"); + } + + return uncompressedLength; + } + catch (DataFormatException e) { + throw new OrcCorruptionException(e, "Invalid compressed stream"); + } + finally { + inflater.end(); + } + } + + private int decompressSnappy(Slice in) + throws IOException + { + byte[] inArray = (byte[]) in.getBase(); + int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET); + int inLength = in.length(); + + int uncompressedLength = Snappy.getUncompressedLength(inArray, inOffset); + checkArgument(uncompressedLength <= maxBufferSize, "Snappy requires buffer (%d) larger than max size (%d)", uncompressedLength, maxBufferSize); + allocateOrGrowBuffer(uncompressedLength, false); + + return Snappy.uncompress(inArray, inOffset, inLength, buffer, 0); + } + + private void allocateOrGrowBuffer(int size, boolean copyExistingData) + { + if (buffer == null || buffer.length < size) { + if (copyExistingData && buffer != null) { + buffer = Arrays.copyOfRange(buffer, 
0, Math.min(size, maxBufferSize)); + } + else { + buffer = new byte[Math.min(size, maxBufferSize)]; + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java new file mode 100644 index 0000000000..58b8b86b2f --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java @@ -0,0 +1,66 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; + +import java.io.IOException; +import java.io.InputStream; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +final class OrcStreamUtils +{ + public static final int MIN_REPEAT_SIZE = 3; + + private OrcStreamUtils() + { + } + + public static void skipFully(InputStream input, long length) + throws IOException + { + while (length > 0) { + long result = input.skip(length); + if (result < 0) { + throw new OrcCorruptionException("Unexpected end of stream"); + } + length -= result; + } + } + + public static void readFully(InputStream input, byte[] buffer, int offset, int length) + throws IOException + { + while (offset < length) { + int result = input.read(buffer, offset, length - offset); + if (result < 0) { + throw new OrcCorruptionException("Unexpected end of stream"); + } + offset += result; + } + } + + static B checkType(A value, Class target, String name) + { + checkNotNull(value, "%s is null", name); + checkArgument(target.isInstance(value), + "%s must be of type %s, not %s", + name, + target.getName(), + value.getClass().getName()); + return target.cast(value); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/RowGroupDictionaryLengthStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/RowGroupDictionaryLengthStream.java new file mode 100644 index 0000000000..5cfc097aa8 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/RowGroupDictionaryLengthStream.java @@ -0,0 +1,52 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.RowGroupDictionaryLengthStreamCheckpoint; + +import java.io.IOException; + +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.checkType; + +public class RowGroupDictionaryLengthStream + extends LongStreamV1 +{ + private int entryCount = -1; + + public RowGroupDictionaryLengthStream(OrcInputStream input, boolean signed) + { + super(input, signed); + } + + public int getEntryCount() + { + return entryCount; + } + + @Override + public Class getCheckpointType() + { + return RowGroupDictionaryLengthStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(LongStreamCheckpoint checkpoint) + throws IOException + { + super.seekToCheckpoint(checkpoint); + RowGroupDictionaryLengthStreamCheckpoint rowGroupDictionaryLengthStreamCheckpoint = checkType(checkpoint, RowGroupDictionaryLengthStreamCheckpoint.class, "Checkpoint"); + entryCount = rowGroupDictionaryLengthStreamCheckpoint.getRowGroupDictionarySize(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSource.java new file mode 100644 index 0000000000..4aba1e3000 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSource.java @@ 
-0,0 +1,26 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import javax.annotation.Nullable; +import java.io.IOException; + +public interface StreamSource> +{ + Class getStreamType(); + + @Nullable + S openStream() + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java new file mode 100644 index 0000000000..2e8acf215a --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.StreamId; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; +import com.google.common.collect.ImmutableMap; + +import javax.annotation.Nonnull; +import java.util.Map; + +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +public class StreamSources +{ + private final Map> streamSources; + + public StreamSources(Map> streamSources) + { + this.streamSources = ImmutableMap.copyOf(checkNotNull(streamSources, "streamSources is null")); + } + + @Nonnull + public > StreamSource getStreamSource(StreamDescriptor streamDescriptor, StreamKind streamKind, Class streamType) + { + checkNotNull(streamDescriptor, "streamDescriptor is null"); + checkNotNull(streamType, "streamType is null"); + + StreamSource streamSource = streamSources.get(new StreamId(streamDescriptor.getStreamId(), streamKind)); + if (streamSource == null) { + streamSource = missingStreamSource(streamType); + } + + checkArgument(streamType.isAssignableFrom(streamSource.getStreamType()), + "%s must be of type %s, not %s", + streamDescriptor, + streamType.getName(), + streamSource.getStreamType().getName()); + + return (StreamSource) streamSource; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStream.java new file mode 100644 index 0000000000..dd15397187 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStream.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; + +import java.io.IOException; + +public interface ValueStream +{ + Class getCheckpointType(); + + void seekToCheckpoint(C checkpoint) + throws IOException; + + void skip(int items) + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreamSource.java new file mode 100644 index 0000000000..45288e8387 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreamSource.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import com.google.common.base.MoreObjects; + +import javax.annotation.Nullable; +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkNotNull; + +public class ValueStreamSource> implements StreamSource +{ + private final S stream; + + public ValueStreamSource(S stream) + { + this.stream = checkNotNull(stream, "stream is null"); + } + + @Override + public Class getStreamType() + { + return (Class) stream.getClass(); + } + + @Nullable + @Override + public S openStream() + throws IOException + { + return stream; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("stream", stream) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreams.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreams.java new file mode 100644 index 0000000000..4953473369 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreams.java @@ -0,0 +1,146 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.StreamId; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; +import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.INT; +import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.TIMESTAMP; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; + +public final class ValueStreams +{ + private ValueStreams() + { + } + + public static ValueStream createValueStreams( + StreamId streamId, + OrcInputStream inputStream, + OrcTypeKind type, + ColumnEncodingKind encoding, + boolean usesVInt) + { + if (streamId.getStreamKind() == PRESENT) { + return new BooleanStream(inputStream); + } + + // dictionary length and data streams are unsigned int streams + if ((encoding == DICTIONARY || encoding == DICTIONARY_V2) && (streamId.getStreamKind() == LENGTH || streamId.getStreamKind() == DATA)) { + return createLongStream(inputStream, encoding, INT, false, usesVInt); + } + + if (streamId.getStreamKind() == DATA) { + switch (type) { + case BOOLEAN: + return new BooleanStream(inputStream); + case BYTE: + return new ByteStream(inputStream); + case SHORT: + case INT: + case LONG: + case DATE: + return createLongStream(inputStream, encoding, type, true, usesVInt); + case FLOAT: + return new FloatStream(inputStream); + case DOUBLE: + return new DoubleStream(inputStream); + case STRING: + case BINARY: + return new ByteArrayStream(inputStream); + case TIMESTAMP: + return createLongStream(inputStream, encoding, type, true, usesVInt); + } + } + + // length stream of a direct encoded string or binary column + if (streamId.getStreamKind() == LENGTH) { + switch (type) { + case STRING: + case 
BINARY: + case MAP: + case LIST: + return createLongStream(inputStream, encoding, type, false, usesVInt); + } + } + + // length stream of a the row group dictionary + if (streamId.getStreamKind() == ROW_GROUP_DICTIONARY_LENGTH) { + switch (type) { + case STRING: + case BINARY: + return new RowGroupDictionaryLengthStream(inputStream, false); + } + } + + // row group dictionary + if (streamId.getStreamKind() == ROW_GROUP_DICTIONARY) { + switch (type) { + case STRING: + case BINARY: + return new ByteArrayStream(inputStream); + } + } + + // row group dictionary + if (streamId.getStreamKind() == IN_DICTIONARY) { + return new BooleanStream(inputStream); + } + + // length (nanos) of a timestamp column + if (type == TIMESTAMP && streamId.getStreamKind() == SECONDARY) { + return createLongStream(inputStream, encoding, type, false, usesVInt); + } + + if (streamId.getStreamKind() == DICTIONARY_DATA) { + switch (type) { + case SHORT: + case INT: + case LONG: + return createLongStream(inputStream, DWRF_DIRECT, INT, true, usesVInt); + case STRING: + case VARCHAR: + case CHAR: + case BINARY: + return new ByteArrayStream(inputStream); + } + } + + throw new IllegalArgumentException(String.format("Unsupported column type %s for stream %s with encoding %s", type, streamId, encoding)); + } + + private static ValueStream createLongStream( + OrcInputStream inputStream, + ColumnEncodingKind encoding, + OrcTypeKind type, + boolean signed, + boolean usesVInt) + { + if (encoding == DIRECT_V2 || encoding == DICTIONARY_V2) { + return new LongStreamV2(inputStream, signed, false); + } + else if (encoding == DIRECT || encoding == DICTIONARY) { + return new LongStreamV1(inputStream, signed); + } + else if (encoding == DWRF_DIRECT) { + return new LongStreamDwrf(inputStream, type, signed, usesVInt); + } + else { + throw new IllegalArgumentException("Unsupported encoding for long stream: " + encoding); + } + } +} From fac7029139f0f72a6b3e8d040e71e4afc4027fbf Mon Sep 17 00:00:00 2001 From: Jongyoung 
Park Date: Mon, 18 May 2015 20:34:45 +0900 Subject: [PATCH 045/141] Sources based JDK 1.7 are applied from Presto --- tajo-storage/tajo-storage-hdfs/pom.xml | 16 +- .../thirdparty/orc/AbstractOrcDataSource.java | 233 ------------- .../storage/thirdparty/orc/BooleanVector.java | 15 +- .../storage/thirdparty/orc/DiskRange.java | 22 -- .../storage/thirdparty/orc/DoubleVector.java | 15 +- .../thirdparty/orc/FileOrcDataSource.java | 88 ++++- .../thirdparty/orc/HdfsOrcDataSource.java | 125 +++++++ .../storage/thirdparty/orc/LongVector.java | 15 +- .../storage/thirdparty/orc/ObjectVector.java | 10 +- .../orc/OrcCorruptionException.java | 10 +- .../storage/thirdparty/orc/OrcDataSource.java | 7 +- .../thirdparty/orc/OrcDataSourceUtils.java | 24 +- .../storage/thirdparty/orc/OrcReader.java | 219 ++++++++++++ .../thirdparty/orc/OrcRecordReader.java | 321 ++++++++++++++++++ .../storage/thirdparty/orc/SliceVector.java | 14 +- .../thirdparty/orc/StreamDescriptor.java | 2 +- .../tajo/storage/thirdparty/orc/StreamId.java | 7 +- .../tajo/storage/thirdparty/orc/Stripe.java | 2 +- .../storage/thirdparty/orc/StripeReader.java | 144 +++----- .../checkpoint/BooleanStreamCheckpoint.java | 2 +- .../checkpoint/ByteArrayStreamCheckpoint.java | 2 +- .../orc/checkpoint/ByteStreamCheckpoint.java | 4 +- .../orc/checkpoint/Checkpoints.java | 29 +- .../checkpoint/DoubleStreamCheckpoint.java | 2 +- .../orc/checkpoint/FloatStreamCheckpoint.java | 2 +- .../orc/checkpoint/InputStreamCheckpoint.java | 2 +- .../checkpoint/LongStreamDwrfCheckpoint.java | 2 +- .../checkpoint/LongStreamV1Checkpoint.java | 4 +- .../checkpoint/LongStreamV2Checkpoint.java | 4 +- ...GroupDictionaryLengthStreamCheckpoint.java | 2 +- .../orc/json/BooleanJsonReader.java | 117 +++++++ .../thirdparty/orc/json/ByteJsonReader.java | 118 +++++++ .../thirdparty/orc/json/DateJsonReader.java | 123 +++++++ .../thirdparty/orc/json/DoubleJsonReader.java | 120 +++++++ .../thirdparty/orc/json/FloatJsonReader.java | 122 +++++++ 
.../JsonMapKeyReader.java} | 14 +- .../thirdparty/orc/json/JsonReader.java | 36 ++ .../thirdparty/orc/json/JsonReaders.java | 100 ++++++ .../thirdparty/orc/json/ListJsonReader.java | 125 +++++++ .../orc/json/LongDictionaryJsonReader.java | 142 ++++++++ .../orc/json/LongDirectJsonReader.java | 112 ++++++ .../thirdparty/orc/json/LongJsonReader.java | 99 ++++++ .../thirdparty/orc/json/MapJsonReader.java | 138 ++++++++ .../orc/json/SliceDictionaryJsonReader.java | 269 +++++++++++++++ .../orc/json/SliceDirectJsonReader.java | 168 +++++++++ .../thirdparty/orc/json/SliceJsonReader.java | 98 ++++++ .../thirdparty/orc/json/StructJsonReader.java | 117 +++++++ .../orc/json/TimestampJsonReader.java | 134 ++++++++ .../orc/metadata/DwrfMetadataReader.java | 78 ++--- .../orc/metadata/OrcMetadataReader.java | 177 +++------- .../orc/metadata/StringStatistics.java | 14 +- .../orc/reader/BooleanStreamReader.java | 21 +- .../orc/reader/ByteStreamReader.java | 21 +- .../orc/reader/DoubleStreamReader.java | 21 +- .../orc/reader/FloatStreamReader.java | 21 +- .../orc/reader/JsonStreamReader.java | 180 ++++++++++ .../reader/LongDictionaryStreamReader.java | 25 +- .../orc/reader/LongDirectStreamReader.java | 21 +- .../orc/reader/LongStreamReader.java | 2 +- .../thirdparty/orc/reader/OrcReaderUtils.java | 32 -- .../reader/SliceDictionaryStreamReader.java | 47 +-- .../orc/reader/SliceDirectStreamReader.java | 35 +- .../orc/reader/SliceStreamReader.java | 2 +- .../thirdparty/orc/reader/StreamReaders.java | 58 ++++ .../orc/reader/TimestampStreamReader.java | 33 +- .../orc/stream/ByteArrayStream.java | 2 +- .../thirdparty/orc/stream/ByteStream.java | 10 +- .../orc/stream/CheckpointStreamSource.java | 2 +- .../thirdparty/orc/stream/DoubleStream.java | 2 +- .../thirdparty/orc/stream/FloatStream.java | 2 +- .../thirdparty/orc/stream/LongDecode.java | 7 +- .../thirdparty/orc/stream/LongStreamDwrf.java | 4 +- .../thirdparty/orc/stream/LongStreamV1.java | 12 +- 
.../thirdparty/orc/stream/LongStreamV2.java | 12 +- .../thirdparty/orc/stream/OrcInputStream.java | 121 +++---- .../thirdparty/orc/stream/OrcStreamUtils.java | 11 +- .../thirdparty/orc/stream/StreamSources.java | 4 +- .../orc/stream/ValueStreamSource.java | 53 --- 78 files changed, 3505 insertions(+), 1021 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/AbstractOrcDataSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java rename tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/{checkpoint/InvalidCheckpointException.java => json/JsonMapKeyReader.java} (65%) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/OrcReaderUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreamSource.java diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 96da422964..efe7b210e3 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml 
+++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -357,12 +357,12 @@ io.airlift slice - 0.10 + 0.7 io.airlift units - 0.108 + 0.97 com.google.guava @@ -382,7 +382,17 @@ com.facebook.presto.hive hive-apache - 0.10 + 0.9 + + + org.jetbrains + annotations + 13.0 + + + com.fasterxml.jackson.core + jackson-core + 2.4.2 com.facebook.hive diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/AbstractOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/AbstractOrcDataSource.java deleted file mode 100644 index e726870b29..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/AbstractOrcDataSource.java +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.ImmutableMap; -import com.google.common.primitives.Ints; -import io.airlift.slice.*; -import io.airlift.slice.ChunkedSliceInput.BufferReference; -import io.airlift.slice.ChunkedSliceInput.SliceLoader; -import io.airlift.units.DataSize; - -import java.io.IOException; -import java.util.LinkedHashMap; -import java.util.Map; -import java.util.Map.Entry; - -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -public abstract class AbstractOrcDataSource - implements OrcDataSource -{ - private final String name; - private final long size; - private final DataSize maxMergeDistance; - private final DataSize maxBufferSize; - private final DataSize streamBufferSize; - private long readTimeNanos; - - public AbstractOrcDataSource(String name, long size, DataSize maxMergeDistance, DataSize maxBufferSize, DataSize streamBufferSize) - { - this.name = checkNotNull(name, "name is null"); - - this.size = size; - checkArgument(size >= 0, "size is negative"); - - this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); - this.maxBufferSize = checkNotNull(maxBufferSize, "maxBufferSize is null"); - this.streamBufferSize = checkNotNull(streamBufferSize, "streamBufferSize is null"); - } - - protected abstract void readInternal(long position, byte[] buffer, int bufferOffset, int bufferLength) - throws IOException; - - @Override - public final long getReadTimeNanos() - { - return readTimeNanos; - } - - @Override - public final long getSize() - { - return size; - } - - @Override - public final void readFully(long position, byte[] buffer) - throws IOException - { - readFully(position, buffer, 0, buffer.length); 
- } - - @Override - public final void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) - throws IOException - { - long start = System.nanoTime(); - - readInternal(position, buffer, bufferOffset, bufferLength); - - readTimeNanos += System.nanoTime() - start; - } - - @Override - public final Map readFully(Map diskRanges) - throws IOException - { - checkNotNull(diskRanges, "diskRanges is null"); - - if (diskRanges.isEmpty()) { - return ImmutableMap.of(); - } - - // - // Note: this code does not use the Java 8 stream APIs to avoid any extra object allocation - // - - // split disk ranges into "big" and "small" - long maxReadSizeBytes = maxBufferSize.toBytes(); - ImmutableMap.Builder smallRangesBuilder = ImmutableMap.builder(); - ImmutableMap.Builder largeRangesBuilder = ImmutableMap.builder(); - for (Entry entry : diskRanges.entrySet()) { - if (entry.getValue().getLength() <= maxReadSizeBytes) { - smallRangesBuilder.put(entry); - } - else { - largeRangesBuilder.put(entry); - } - } - Map smallRanges = smallRangesBuilder.build(); - Map largeRanges = largeRangesBuilder.build(); - - // read ranges - ImmutableMap.Builder slices = ImmutableMap.builder(); - slices.putAll(readSmallDiskRanges(smallRanges)); - slices.putAll(readLargeDiskRanges(largeRanges)); - - return slices.build(); - } - - private Map readSmallDiskRanges(Map diskRanges) - throws IOException - { - if (diskRanges.isEmpty()) { - return ImmutableMap.of(); - } - - Iterable mergedRanges = mergeAdjacentDiskRanges(diskRanges.values(), maxMergeDistance, maxBufferSize); - - // read ranges - Map buffers = new LinkedHashMap(); - for (DiskRange mergedRange : mergedRanges) { - // read full range in one request - byte[] buffer = new byte[mergedRange.getLength()]; - readFully(mergedRange.getOffset(), buffer); - buffers.put(mergedRange, buffer); - } - - ImmutableMap.Builder slices = ImmutableMap.builder(); - for (Entry entry : diskRanges.entrySet()) { - slices.put(entry.getKey(), 
getDiskRangeSlice(entry.getValue(), buffers).getInput()); - } - return slices.build(); - } - - private Map readLargeDiskRanges(Map diskRanges) - throws IOException - { - if (diskRanges.isEmpty()) { - return ImmutableMap.of(); - } - - ImmutableMap.Builder slices = ImmutableMap.builder(); - for (Entry entry : diskRanges.entrySet()) { - ChunkedSliceInput sliceInput = new ChunkedSliceInput(new HdfsSliceLoader(entry.getValue()), Ints.checkedCast(streamBufferSize.toBytes())); - slices.put(entry.getKey(), sliceInput); - } - return slices.build(); - } - - @Override - public final String toString() - { - return name; - } - - private class HdfsSliceLoader - implements SliceLoader - { - private final DiskRange diskRange; - - public HdfsSliceLoader(DiskRange diskRange) - { - this.diskRange = diskRange; - } - - @Override - public SliceBufferReference createBuffer(int bufferSize) - { - return new SliceBufferReference(bufferSize); - } - - @Override - public long getSize() - { - return diskRange.getLength(); - } - - @Override - public void load(long position, SliceBufferReference bufferReference, int length) - { - try { - readFully(diskRange.getOffset() + position, bufferReference.getBuffer(), 0, length); - } - catch (IOException e) { - new RuntimeIOException(e); - } - } - - @Override - public void close() - { - } - } - - private static class SliceBufferReference - implements BufferReference - { - private final byte[] buffer; - private final Slice slice; - - public SliceBufferReference(int bufferSize) - { - this.buffer = new byte[bufferSize]; - this.slice = Slices.wrappedBuffer(buffer); - } - - public byte[] getBuffer() - { - return buffer; - } - - @Override - public Slice getSlice() - { - return slice; - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java index ae62d407ac..aaa1ada35c 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java @@ -18,23 +18,14 @@ public class BooleanVector implements Vector { - public final boolean[] isNull; - public final boolean[] vector; - - public BooleanVector(int length) - { - if (length > MAX_VECTOR_LENGTH) { - throw new IllegalArgumentException("length greater than max vector length"); - } - isNull = new boolean[length]; - vector = new boolean[length]; - } + public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; + public final boolean[] vector = new boolean[MAX_VECTOR_LENGTH]; @Override @VisibleForTesting public ObjectVector toObjectVector(int size) { - ObjectVector objectVector = new ObjectVector(vector.length); + ObjectVector objectVector = new ObjectVector(); for (int i = 0; i < size; i++) { if (!isNull[i]) { objectVector.vector[i] = vector[i]; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java index fdd47556ce..8a3f249c3f 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java @@ -15,8 +15,6 @@ import com.google.common.primitives.Ints; -import java.util.Objects; - import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; @@ -68,26 +66,6 @@ public DiskRange span(DiskRange otherDiskRange) return new DiskRange(start, Ints.checkedCast(end - start)); } - @Override - public int hashCode() - { - return Objects.hash(offset, length); - } - - @Override - public boolean equals(Object obj) - { - if 
(this == obj) { - return true; - } - if (obj == null || getClass() != obj.getClass()) { - return false; - } - DiskRange other = (DiskRange) obj; - return Objects.equals(this.offset, other.offset) - && Objects.equals(this.length, other.length); - } - @Override public String toString() { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java index ba40c493b4..8f20d29590 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java @@ -18,23 +18,14 @@ public class DoubleVector implements Vector { - public final boolean[] isNull; - public final double[] vector; - - public DoubleVector(int length) - { - if (length > MAX_VECTOR_LENGTH) { - throw new IllegalArgumentException("length greater than max vector length"); - } - vector = new double[length]; - isNull = new boolean[length]; - } + public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; + public final double[] vector = new double[MAX_VECTOR_LENGTH]; @Override @VisibleForTesting public ObjectVector toObjectVector(int size) { - ObjectVector objectVector = new ObjectVector(vector.length); + ObjectVector objectVector = new ObjectVector(); for (int i = 0; i < size; i++) { if (!isNull[i]) { objectVector.vector[i] = vector[i]; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java index 5325d4b48f..3d0c42eb89 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java @@ -13,23 +13,38 @@ */ package org.apache.tajo.storage.thirdparty.orc; +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slice; import io.airlift.units.DataSize; import java.io.File; -import java.io.FileNotFoundException; import java.io.IOException; import java.io.RandomAccessFile; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Map.Entry; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; +import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; public class FileOrcDataSource - extends AbstractOrcDataSource + implements OrcDataSource { + private final File path; + private final long size; private final RandomAccessFile input; + private final DataSize maxMergeDistance; + private long readTimeNanos; - public FileOrcDataSource(File path, DataSize maxMergeDistance, DataSize maxReadSize, DataSize streamBufferSize) - throws FileNotFoundException + public FileOrcDataSource(File path, DataSize maxMergeDistance) + throws IOException { - super(path.getPath(), path.length(), maxMergeDistance, maxReadSize, streamBufferSize); + this.path = checkNotNull(path, "path is null"); + this.size = path.length(); this.input = new RandomAccessFile(path, "r"); + + this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); } @Override @@ -40,10 +55,71 @@ public void close() } @Override - protected void readInternal(long position, byte[] buffer, int bufferOffset, int bufferLength) + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public long getSize() + { + return size; + } + + @Override + public void readFully(long position, byte[] buffer) + throws IOException + { + readFully(position, buffer, 0, buffer.length); + } + + @Override + 
public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) throws IOException { + long start = System.nanoTime(); + input.seek(position); input.readFully(buffer, bufferOffset, bufferLength); + + readTimeNanos += System.nanoTime() - start; + } + + @Override + public Map readFully(Map diskRanges) + throws IOException + { + checkNotNull(diskRanges, "diskRanges is null"); + + if (diskRanges.isEmpty()) { + return ImmutableMap.of(); + } + + // TODO: benchmark alternatively strategies: + // 1) sort ranges and perform one read per range + // 2) single read with transferTo() using custom WritableByteChannel + + Iterable mergedRanges = mergeAdjacentDiskRanges(diskRanges.values(), maxMergeDistance); + + // read ranges + Map buffers = new LinkedHashMap(); + for (DiskRange mergedRange : mergedRanges) { + // read full range in one request + byte[] buffer = new byte[mergedRange.getLength()]; + readFully(mergedRange.getOffset(), buffer); + buffers.put(mergedRange, buffer); + } + + ImmutableMap.Builder slices = ImmutableMap.builder(); + for (Entry entry : diskRanges.entrySet()) { + slices.put(entry.getKey(), getDiskRangeSlice(entry.getValue(), buffers)); + } + return slices.build(); + } + + @Override + public String toString() + { + return path.getPath(); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java new file mode 100644 index 0000000000..a373c27581 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java @@ -0,0 +1,125 @@ + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.tajo.storage.thirdparty.orc.DiskRange; +import org.apache.tajo.storage.thirdparty.orc.OrcDataSource; +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slice; +import io.airlift.units.DataSize; +import org.apache.hadoop.fs.FSDataInputStream; + +import java.io.IOException; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Map.Entry; + +import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; +import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +public class HdfsOrcDataSource + implements OrcDataSource +{ + private final FSDataInputStream inputStream; + private final String path; + private final long size; + private final DataSize maxMergeDistance; + private long readTimeNanos; + + public HdfsOrcDataSource(String path, FSDataInputStream inputStream, long size, DataSize maxMergeDistance) + { + this.path = checkNotNull(path, "path is null"); + this.inputStream = checkNotNull(inputStream, "inputStream is null"); + this.size = size; + checkArgument(size >= 0, "size is negative"); + + this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); + } + + @Override + public void close() + throws IOException + { + inputStream.close(); + } + + @Override + public long getReadTimeNanos() + { + return 
readTimeNanos; + } + + @Override + public long getSize() + { + return size; + } + + @Override + public void readFully(long position, byte[] buffer) + throws IOException + { + readFully(position, buffer, 0, buffer.length); + } + + @Override + public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + throws IOException + { + long start = System.nanoTime(); + + inputStream.readFully(position, buffer, bufferOffset, bufferLength); + + readTimeNanos += System.nanoTime() - start; + } + + @Override + public Map readFully(Map diskRanges) + throws IOException + { + checkNotNull(diskRanges, "diskRanges is null"); + + if (diskRanges.isEmpty()) { + return ImmutableMap.of(); + } + + Iterable mergedRanges = mergeAdjacentDiskRanges(diskRanges.values(), maxMergeDistance); + + // read ranges + Map buffers = new LinkedHashMap(); + for (DiskRange mergedRange : mergedRanges) { + // read full range in one request + byte[] buffer = new byte[mergedRange.getLength()]; + readFully(mergedRange.getOffset(), buffer); + buffers.put(mergedRange, buffer); + } + + ImmutableMap.Builder slices = ImmutableMap.builder(); + for (Entry entry : diskRanges.entrySet()) { + slices.put(entry.getKey(), getDiskRangeSlice(entry.getValue(), buffers)); + } + return slices.build(); + } + + @Override + public String toString() + { + return path; + } +} + + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java index 1c4834eca1..7c9407a3e6 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java @@ -18,23 +18,14 @@ public class LongVector implements Vector { - public final boolean[] isNull; - public final long[] vector; - - public LongVector(int length) - { - if 
(length > MAX_VECTOR_LENGTH) { - throw new IllegalArgumentException("length greater than max vector length"); - } - vector = new long[length]; - isNull = new boolean[length]; - } + public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; + public final long[] vector = new long[MAX_VECTOR_LENGTH]; @Override @VisibleForTesting public ObjectVector toObjectVector(int size) { - ObjectVector objectVector = new ObjectVector(vector.length); + ObjectVector objectVector = new ObjectVector(); for (int i = 0; i < size; i++) { if (!isNull[i]) { objectVector.vector[i] = vector[i]; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java index 7419ebb8d7..19f9608f7d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java @@ -18,15 +18,7 @@ public class ObjectVector implements Vector { - public final Object[] vector; - - public ObjectVector(int length) - { - if (length > MAX_VECTOR_LENGTH) { - throw new IllegalArgumentException("length greater than max vector length"); - } - vector = new Object[length]; - } + public final Object[] vector = new Object[MAX_VECTOR_LENGTH]; @Override @VisibleForTesting diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java index db2f6b6b5f..c780bcb51f 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java @@ -13,6 +13,8 @@ */ package 
org.apache.tajo.storage.thirdparty.orc; +import org.jetbrains.annotations.Contract; + import java.io.IOException; import static java.lang.String.format; @@ -20,9 +22,13 @@ public class OrcCorruptionException extends IOException { - public OrcCorruptionException(String message) + @Contract("false, _, _ -> fail") + public static void verifyFormat(boolean test, String messageFormat, Object... args) + throws OrcCorruptionException { - super(message); + if (!test) { + throw new OrcCorruptionException(messageFormat, args); + } } public OrcCorruptionException(String messageFormat, Object... args) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java index c577ccb5f7..8eb1cbdd00 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java @@ -13,7 +13,7 @@ */ package org.apache.tajo.storage.thirdparty.orc; -import io.airlift.slice.FixedLengthSliceInput; +import io.airlift.slice.Slice; import java.io.Closeable; import java.io.IOException; @@ -32,9 +32,6 @@ void readFully(long position, byte[] buffer) void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) throws IOException; - Map readFully(Map diskRanges) + Map readFully(Map diskRanges) throws IOException; - - @Override - void close() throws IOException; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java index 600a1425fa..ba65c3c55c 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java @@ -13,16 +13,12 @@ */ package org.apache.tajo.storage.thirdparty.orc; -import com.google.common.collect.ImmutableList; import com.google.common.primitives.Ints; import io.airlift.slice.Slice; import io.airlift.slice.Slices; import io.airlift.units.DataSize; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.Map.Entry; import static com.google.common.collect.Lists.newArrayList; @@ -36,29 +32,25 @@ private OrcDataSourceUtils() /** * Merge disk ranges that are closer than {@code maxMergeDistance}. */ - public static List mergeAdjacentDiskRanges(Iterable diskRanges, DataSize maxMergeDistance, DataSize maxReadSize) + public static Iterable mergeAdjacentDiskRanges(Iterable diskRanges, DataSize maxMergeDistance) { // sort ranges by start offset List ranges = newArrayList(diskRanges); - Collections.sort(ranges, new Comparator() - { + Collections.sort(ranges, new Comparator() { @Override - public int compare(DiskRange o1, DiskRange o2) - { + public int compare(DiskRange o1, DiskRange o2) { return Long.compare(o1.getOffset(), o2.getOffset()); } }); // merge overlapping ranges - long maxReadSizeBytes = maxReadSize.toBytes(); long maxMergeDistanceBytes = maxMergeDistance.toBytes(); - ImmutableList.Builder result = ImmutableList.builder(); + List result = new ArrayList(); DiskRange last = ranges.get(0); for (int i = 1; i < ranges.size(); i++) { DiskRange current = ranges.get(i); - DiskRange merged = last.span(current); - if (merged.getLength() <= maxReadSizeBytes && last.getEnd() + maxMergeDistanceBytes >= current.getOffset()) { - last = merged; + if (last.getEnd() + maxMergeDistanceBytes + 1 >= current.getOffset()) { + last = last.span(current); } else { result.add(last); @@ -67,7 +59,7 @@ public int compare(DiskRange o1, DiskRange o2) } result.add(last); - return 
result.build(); + return result; } /** diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java new file mode 100644 index 0000000000..144baa5e7b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java @@ -0,0 +1,219 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.base.Joiner; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.metadata.*; +import org.apache.tajo.storage.thirdparty.orc.stream.OrcInputStream; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static io.airlift.slice.SizeOf.SIZE_OF_BYTE; + +public class OrcReader +{ + private static final Slice MAGIC = Slices.utf8Slice("ORC"); + private static final int CURRENT_MAJOR_VERSION = 0; + private static final int CURRENT_MINOR_VERSION = 12; + private static final int EXPECTED_FOOTER_SIZE = 16 * 1024; + + private final OrcDataSource orcDataSource; + private final MetadataReader metadataReader; + private final CompressionKind compressionKind; + private final int bufferSize; + private final Footer footer; + private final Metadata metadata; + + // This is based on the Apache Hive ORC code + public OrcReader(OrcDataSource orcDataSource, MetadataReader metadataReader) + throws IOException + { + this.orcDataSource = checkNotNull(orcDataSource, "orcDataSource is null"); + this.metadataReader = checkNotNull(metadataReader, "metadataReader is null"); + + // + // Read the file tail: + // + // variable: Footer + // variable: Metadata + // variable: PostScript - contains length of footer and metadata + // 3 bytes: file magic "ORC" + // 1 byte: postScriptSize = PostScript + Magic + + // figure out the size of the file using the option or filesystem + long size = orcDataSource.getSize(); + + // Read the tail of the file + byte[] buffer = new byte[(int) Math.min(size, EXPECTED_FOOTER_SIZE)]; + orcDataSource.readFully(size - buffer.length, buffer); + + // get length of 
PostScript - last byte of the file + int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff; + + // make sure this is an ORC file and not an RCFile or something else + verifyOrcFooter(orcDataSource, postScriptSize, buffer); + + // decode the post script + int postScriptOffset = buffer.length - SIZE_OF_BYTE - postScriptSize; + PostScript postScript = metadataReader.readPostScript(buffer, postScriptOffset, postScriptSize); + + // verify this is a supported version + checkOrcVersion(orcDataSource, postScript.getVersion()); + + // check compression codec is supported + this.compressionKind = postScript.getCompression(); + + this.bufferSize = Ints.checkedCast(postScript.getCompressionBlockSize()); + + int footerSize = Ints.checkedCast(postScript.getFooterLength()); + int metadataSize = Ints.checkedCast(postScript.getMetadataLength()); + + // check if extra bytes need to be read + Slice completeFooterSlice; + int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE; + if (completeFooterSize > buffer.length) { + // allocate a new buffer large enough for the complete footer + byte[] newBuffer = new byte[completeFooterSize]; + completeFooterSlice = Slices.wrappedBuffer(newBuffer); + + // initial read was not large enough, so read missing section + orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length); + + // copy already read bytes into the new buffer + completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer); + } + else { + // footer is already in the bytes in buffer, just adjust position, length + completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize); + } + + // read metadata + Slice metadataSlice = completeFooterSlice.slice(0, metadataSize); + InputStream metadataInputStream = new OrcInputStream(orcDataSource.toString(), metadataSlice.getInput(), compressionKind, bufferSize); + this.metadata = 
metadataReader.readMetadata(metadataInputStream); + + // read footer + Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize); + InputStream footerInputStream = new OrcInputStream(orcDataSource.toString(), footerSlice.getInput(), compressionKind, bufferSize); + this.footer = metadataReader.readFooter(footerInputStream); + } + + public List getColumnNames() + { + return footer.getTypes().get(0).getFieldNames(); + } + + public Footer getFooter() + { + return footer; + } + + public Metadata getMetadata() + { + return metadata; + } + + public CompressionKind getCompressionKind() + { + return compressionKind; + } + + public int getBufferSize() + { + return bufferSize; + } + + public OrcRecordReader createRecordReader( + Set includedColumns, + OrcPredicate predicate, + long offset, + long length, + DateTimeZone hiveStorageTimeZone) + throws IOException + { + return new OrcRecordReader( + checkNotNull(includedColumns, "includedColumns is null"), + checkNotNull(predicate, "predicate is null"), + footer.getNumberOfRows(), + footer.getStripes(), + footer.getFileStats(), + metadata.getStripeStatsList(), + orcDataSource, + offset, + length, + footer.getTypes(), + compressionKind, + bufferSize, + footer.getRowsInRowGroup(), + checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"), + metadataReader); + } + + /** + * Verify this is an ORC file to prevent users from trying to read text + * files or RC files as ORC files. + */ + // This is based on the Apache Hive ORC code + private static void verifyOrcFooter( + OrcDataSource source, + int postScriptSize, + byte[] buffer) + throws IOException + { + int magicLength = MAGIC.length(); + checkArgument(postScriptSize >= magicLength + 1, "Malformed ORC file %s. 
Invalid postscript length %s", source, postScriptSize); + + if (!MAGIC.equals(Slices.wrappedBuffer(buffer, buffer.length - 1 - magicLength, magicLength))) { + // Old versions of ORC (0.11) wrote the magic to the head of the file + byte[] headerMagic = new byte[magicLength]; + source.readFully(0, headerMagic); + + // if it isn't there, this isn't an ORC file + checkArgument(MAGIC.equals(Slices.wrappedBuffer(headerMagic)), "Malformed ORC file %s. Invalid postscript.", source); + } + } + + /** + * Check to see if this ORC file is from a future version and if so, + * warn the user that we may not be able to read all of the column encodings. + */ + // This is based on the Apache Hive ORC code + private static void checkOrcVersion(OrcDataSource orcDataSource, List version) + { + if (version.size() >= 1) { + int major = version.get(0); + int minor = 0; + if (version.size() > 1) { + minor = version.get(1); + } + + if (major > CURRENT_MAJOR_VERSION || (major == CURRENT_MAJOR_VERSION && minor > CURRENT_MINOR_VERSION)) { + System.err.println(String.format("ORC file %s was written by a newer Hive version %s. This file may not be readable by this version of Hive (%s.%s).", + orcDataSource, + Joiner.on('.').join(version), + CURRENT_MAJOR_VERSION, + CURRENT_MINOR_VERSION)); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java new file mode 100644 index 0000000000..9f0e78300d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java @@ -0,0 +1,321 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.metadata.*; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.reader.StreamReader; +import org.apache.tajo.storage.thirdparty.orc.reader.StreamReaders; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +public class OrcRecordReader +{ + private final OrcDataSource orcDataSource; + + private final StreamReader[] streamReaders; + + private final long totalRowCount; + private final long splitLength; + private final Set presentColumns; + private long currentPosition; + + private final List stripes; + private final StripeReader stripeReader; + private int currentStripe = -1; + + private Iterator rowGroups = ImmutableList.of().iterator(); + private long currentGroupRowCount; + private long nextRowInGroup; + + public OrcRecordReader( + Set includedColumns, + OrcPredicate predicate, + long numberOfRows, + List fileStripes, + List fileStats, + List stripeStats, + OrcDataSource 
orcDataSource, + long splitOffset, + long splitLength, + List types, + CompressionKind compressionKind, + int bufferSize, + int rowsInRowGroup, + DateTimeZone hiveStorageTimeZone, + MetadataReader metadataReader) + throws IOException + { + checkNotNull(includedColumns, "includedColumns is null"); + checkNotNull(predicate, "predicate is null"); + checkNotNull(fileStripes, "fileStripes is null"); + checkNotNull(stripeStats, "stripeStats is null"); + checkNotNull(orcDataSource, "orcDataSource is null"); + checkNotNull(types, "types is null"); + checkNotNull(compressionKind, "compressionKind is null"); + checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"); + + // reduce the included columns to the set that is also present + ImmutableSet.Builder presentColumns = ImmutableSet.builder(); + OrcType root = types.get(0); + for (int includedColumn : includedColumns) { + // an old file can have less columns since columns can be added + // after the file was written + if (includedColumn < root.getFieldCount()) { + presentColumns.add(includedColumn); + } + } + this.presentColumns = presentColumns.build(); + + this.orcDataSource = orcDataSource; + this.splitLength = splitLength; + + // it is possible that old versions of orc use 0 to mean there are no row groups + checkArgument(rowsInRowGroup > 0, "rowsInRowGroup must be greater than zero"); + + long totalRowCount = 0; + ImmutableList.Builder stripes = ImmutableList.builder(); + if (predicate.matches(numberOfRows, getStatisticsByColumnOrdinal(root, fileStats))) { + // select stripes that start within the specified split + for (int stripeIndex = 0; stripeIndex < fileStripes.size(); stripeIndex++) { + StripeInformation stripe = fileStripes.get(stripeIndex); + if (splitContainsStripe(splitOffset, splitLength, stripe) && isStripeIncluded(root, stripe, stripeStats, predicate, stripeIndex)) { + stripes.add(stripe); + totalRowCount += stripe.getNumberOfRows(); + } + } + } + this.totalRowCount = totalRowCount; + this.stripes 
= stripes.build(); + + stripeReader = new StripeReader( + orcDataSource, + compressionKind, + types, + bufferSize, + this.presentColumns, + rowsInRowGroup, + predicate, + metadataReader); + + streamReaders = createStreamReaders(orcDataSource, types, hiveStorageTimeZone, this.presentColumns); + } + + private static boolean splitContainsStripe(long splitOffset, long splitLength, StripeInformation stripe) + { + long splitEndOffset = splitOffset + splitLength; + return splitOffset <= stripe.getOffset() && stripe.getOffset() < splitEndOffset; + } + + private static boolean isStripeIncluded( + OrcType rootStructType, + StripeInformation stripe, + List stripeStats, + OrcPredicate predicate, + int stripeIndex) + { + // if there are no stats, include the column + if (stripeIndex >= stripeStats.size()) { + return true; + } + + return predicate.matches(stripe.getNumberOfRows(), getStatisticsByColumnOrdinal(rootStructType, stripeStats.get(stripeIndex).getColumnStatistics())); + } + + public long getPosition() + { + return currentPosition; + } + + public long getTotalRowCount() + { + return totalRowCount; + } + + public float getProgress() + { + return ((float) currentPosition) / totalRowCount; + } + + public long getSplitLength() + { + return splitLength; + } + + public void close() + throws IOException + { + orcDataSource.close(); + } + + public boolean isColumnPresent(int hiveColumnIndex) + { + return presentColumns.contains(hiveColumnIndex); + } + + public int nextBatch() + throws IOException + { + // if next row is within the current group return + if (nextRowInGroup >= currentGroupRowCount) { + // attempt to advance to next row group + if (!advanceToNextRowGroup()) { + return -1; + } + } + + int batchSize = Ints.checkedCast(Math.min(Vector.MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup)); + + for (StreamReader column : streamReaders) { + if (column != null) { + column.prepareNextRead(batchSize); + } + } + nextRowInGroup += batchSize; + currentPosition += 
batchSize; + return batchSize; + } + + public void readVector(int columnIndex, Object vector) + throws IOException + { + streamReaders[columnIndex].readBatch(vector); + } + + private boolean advanceToNextRowGroup() + throws IOException + { + nextRowInGroup = 0; + + while (!rowGroups.hasNext() && currentStripe < stripes.size()) { + advanceToNextStripe(); + } + + if (!rowGroups.hasNext()) { + currentGroupRowCount = 0; + return false; + } + + RowGroup currentRowGroup = rowGroups.next(); + currentGroupRowCount = currentRowGroup.getRowCount(); + + // give reader data streams from row group + StreamSources rowGroupStreamSources = currentRowGroup.getStreamSources(); + for (StreamReader column : streamReaders) { + if (column != null) { + column.startRowGroup(rowGroupStreamSources); + } + } + + return true; + } + + private void advanceToNextStripe() + throws IOException + { + currentStripe++; + if (currentStripe >= stripes.size()) { + return; + } + + StripeInformation stripeInformation = stripes.get(currentStripe); + Stripe stripe = stripeReader.readStripe(stripeInformation); + if (stripe != null) { + // Give readers access to dictionary streams + StreamSources dictionaryStreamSources = stripe.getDictionaryStreamSources(); + List columnEncodings = stripe.getColumnEncodings(); + for (StreamReader column : streamReaders) { + if (column != null) { + column.startStripe(dictionaryStreamSources, columnEncodings); + } + } + + rowGroups = stripe.getRowGroups().iterator(); + } + else { + rowGroups = ImmutableList.of().iterator(); + } + } + + private static StreamReader[] createStreamReaders(OrcDataSource orcDataSource, + List types, + DateTimeZone hiveStorageTimeZone, + Set includedColumns) + { + List streamDescriptors = createStreamDescriptor("", "", 0, types, orcDataSource).getNestedStreams(); + + OrcType rowType = types.get(0); + StreamReader[] streamReaders = new StreamReader[rowType.getFieldCount()]; + for (int columnId = 0; columnId < rowType.getFieldCount(); columnId++) { + 
if (includedColumns.contains(columnId)) { + StreamDescriptor streamDescriptor = streamDescriptors.get(columnId); + streamReaders[columnId] = StreamReaders.createStreamReader(streamDescriptor, hiveStorageTimeZone); + } + } + return streamReaders; + } + + private static StreamDescriptor createStreamDescriptor(String parentStreamName, String fieldName, int typeId, List types, OrcDataSource dataSource) + { + OrcType type = types.get(typeId); + + if (!fieldName.isEmpty()) { + parentStreamName += "." + fieldName; + } + + ImmutableList.Builder nestedStreams = ImmutableList.builder(); + if (type.getOrcTypeKind() == OrcTypeKind.STRUCT) { + for (int i = 0; i < type.getFieldCount(); ++i) { + nestedStreams.add(createStreamDescriptor(parentStreamName, type.getFieldName(i), type.getFieldTypeIndex(i), types, dataSource)); + } + } + else if (type.getOrcTypeKind() == OrcTypeKind.LIST) { + nestedStreams.add(createStreamDescriptor(parentStreamName, "item", type.getFieldTypeIndex(0), types, dataSource)); + } + else if (type.getOrcTypeKind() == OrcTypeKind.MAP) { + nestedStreams.add(createStreamDescriptor(parentStreamName, "key", type.getFieldTypeIndex(0), types, dataSource)); + nestedStreams.add(createStreamDescriptor(parentStreamName, "value", type.getFieldTypeIndex(1), types, dataSource)); + } + return new StreamDescriptor(parentStreamName, typeId, fieldName, type.getOrcTypeKind(), dataSource, nestedStreams.build()); + } + + private static Map getStatisticsByColumnOrdinal(OrcType rootStructType, List fileStats) + { + checkNotNull(rootStructType, "rootStructType is null"); + checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT); + checkNotNull(fileStats, "fileStats is null"); + + ImmutableMap.Builder statistics = ImmutableMap.builder(); + for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) { + ColumnStatistics element = fileStats.get(rootStructType.getFieldTypeIndex(ordinal)); + if (element != null) { + statistics.put(ordinal, element); + } + } + 
return statistics.build(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java index e1a696fdc5..01cfbfca80 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java @@ -19,24 +19,16 @@ public class SliceVector implements Vector { - public final Slice[] vector; - - public SliceVector(int length) - { - if (length > MAX_VECTOR_LENGTH) { - throw new IllegalArgumentException("length greater than max vector length"); - } - vector = new Slice[length]; - } + public final Slice[] vector = new Slice[MAX_VECTOR_LENGTH]; @Override @VisibleForTesting public ObjectVector toObjectVector(int size) { - ObjectVector objectVector = new ObjectVector(vector.length); + ObjectVector objectVector = new ObjectVector(); for (int i = 0; i < size; i++) { if (vector[i] != null) { - objectVector.vector[i] = vector[i]; + objectVector.vector[i] = vector[i].toStringUtf8(); } } return objectVector; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java index 9a5b53b464..a8108e6f36 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java @@ -13,8 +13,8 @@ */ package org.apache.tajo.storage.thirdparty.orc; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; import com.google.common.collect.ImmutableList; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; import java.util.List; diff 
--git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java index 08afe28fd1..3cec23c247 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java @@ -16,6 +16,8 @@ import org.apache.tajo.storage.thirdparty.orc.metadata.Stream; import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; +import java.util.Objects; + import static com.google.common.base.MoreObjects.toStringHelper; public final class StreamId @@ -48,7 +50,7 @@ public StreamKind getStreamKind() @Override public int hashCode() { - return 31 * column + streamKind.hashCode(); + return Objects.hash(column, streamKind); } @Override @@ -60,9 +62,8 @@ public boolean equals(Object obj) if (obj == null || getClass() != obj.getClass()) { return false; } - StreamId other = (StreamId) obj; - return column == other.column && streamKind == other.streamKind; + return Objects.equals(this.column, other.column) && Objects.equals(this.streamKind, other.streamKind); } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java index 54aa513a35..a95353160e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java @@ -13,9 +13,9 @@ */ package org.apache.tajo.storage.thirdparty.orc; +import com.google.common.collect.ImmutableList; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import 
com.google.common.collect.ImmutableList; import java.util.List; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java index 6cc26c666b..1e4c4bc273 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java @@ -13,20 +13,19 @@ */ package org.apache.tajo.storage.thirdparty.orc; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.InvalidCheckpointException; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; -import org.apache.tajo.storage.thirdparty.orc.stream.*; +import com.google.common.base.Function; import com.google.common.base.Predicates; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; import com.google.common.primitives.Ints; -import io.airlift.slice.FixedLengthSliceInput; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.metadata.*; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.stream.*; +import io.airlift.slice.Slice; import io.airlift.slice.Slices; import java.io.IOException; @@ -34,14 +33,14 @@ import java.util.*; import 
java.util.Map.Entry; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getStreamCheckpoints; import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY; import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; import static org.apache.tajo.storage.thirdparty.orc.stream.CheckpointStreamSource.createCheckpointStreamSource; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; public class StripeReader { @@ -82,76 +81,29 @@ public Stripe readStripe(StripeInformation stripe) // get streams for selected columns Map streams = new HashMap(); - boolean hasRowGroupDictionary = false; for (Stream stream : stripeFooter.getStreams()) { if (includedOrcColumns.contains(stream.getColumn())) { streams.put(new StreamId(stream), stream); - - ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind(); - if (columnEncoding == DICTIONARY && stream.getStreamKind() == StreamKind.IN_DICTIONARY) { - hasRowGroupDictionary = true; - } } } - if (stripe.getNumberOfRows() > 10000 || hasRowGroupDictionary) { - // determine ranges of the stripe to read - Map diskRanges = getDiskRanges(stripeFooter.getStreams()); - diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet())); - - // read the file regions - Map streamsData = readDiskRanges(stripe.getOffset(), diskRanges); - - // read the row index for each column - Map> columnIndexes = readColumnIndexes(streams, streamsData); - - // select the row groups matching the tuple 
domain - Set selectedRowGroups = selectRowGroups(stripe, columnIndexes); - - // if all row groups are skipped, return null - if (selectedRowGroups.isEmpty()) { - return null; - } - - // value streams - Map> valueStreams = createValueStreams(streams, streamsData, columnEncodings); + // determine ranges of the stripe to read + Map diskRanges = getDiskRanges(stripeFooter.getStreams()); + diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet())); - // build the dictionary streams - StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings); + // read the file regions + Map streamsData = readDiskRanges(stripe.getOffset(), diskRanges); - // build the row groups - try { - List rowGroups = createRowGroups( - stripe.getNumberOfRows(), - streams, - valueStreams, - columnIndexes, - selectedRowGroups, - columnEncodings); + // read the row index for each column + Map> columnIndexes = readColumnIndexes(streams, streamsData); - return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources); - } - catch (InvalidCheckpointException e) { - // The ORC file contains a corrupt checkpoint stream - // If the file does not have a row group dictionary, treat the stripe as a single row group. Otherwise, - // we must fail because the length of the row group dictionary is contained in the checkpoint stream. 
- if (hasRowGroupDictionary) { - throw new OrcCorruptionException(e, "ORC file %s has corrupt checkpoints", orcDataSource); - } - } - } + // select the row groups matching the tuple domain + Set selectedRowGroups = selectRowGroups(stripe, columnIndexes); - ImmutableMap.Builder diskRangesBuilder = ImmutableMap.builder(); - for (Entry entry : getDiskRanges(stripeFooter.getStreams()).entrySet()) { - StreamId streamId = entry.getKey(); - if (streamId.getStreamKind() != ROW_INDEX && streams.keySet().contains(streamId)) { - diskRangesBuilder.put(entry); - } + // if all row groups are skipped, return null + if (selectedRowGroups.isEmpty()) { + return null; } - ImmutableMap diskRanges = diskRangesBuilder.build(); - - // read the file regions - Map streamsData = readDiskRanges(stripe.getOffset(), diskRanges); // value streams Map> valueStreams = createValueStreams(streams, streamsData, columnEncodings); @@ -159,41 +111,40 @@ public Stripe readStripe(StripeInformation stripe) // build the dictionary streams StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings); - // build the row group - ImmutableMap.Builder> builder = ImmutableMap.builder(); - for (Entry> entry : valueStreams.entrySet()) { - builder.put(entry.getKey(), new ValueStreamSource>(entry.getValue())); - } - RowGroup rowGroup = new RowGroup(0, stripe.getNumberOfRows(), new StreamSources(builder.build())); + // build the row groups + List rowGroups = createRowGroups( + stripe.getNumberOfRows(), + streams, + valueStreams, + columnIndexes, + selectedRowGroups, + columnEncodings); - return new Stripe(stripe.getNumberOfRows(), columnEncodings, ImmutableList.of(rowGroup), dictionaryStreamSources); + return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources); } - public Map readDiskRanges(long stripeOffset, Map diskRanges) + public Map readDiskRanges(final long stripeOffset, Map diskRanges) throws IOException { - // - // Note: 
this code does not use the Java 8 stream APIs to avoid any extra object allocation - // - // transform ranges to have an absolute offset in file - ImmutableMap.Builder diskRangesBuilder = ImmutableMap.builder(); - for (Entry entry : diskRanges.entrySet()) { - DiskRange diskRange = entry.getValue(); - diskRangesBuilder.put(entry.getKey(), new DiskRange(stripeOffset + diskRange.getOffset(), diskRange.getLength())); - } - diskRanges = diskRangesBuilder.build(); + diskRanges = Maps.transformValues(diskRanges, new Function() { + @Override + public DiskRange apply(DiskRange diskRange) + { + return new DiskRange(stripeOffset + diskRange.getOffset(), diskRange.getLength()); + } + }); - // read ranges - Map streamsData = orcDataSource.readFully(diskRanges); + Map streamsData = orcDataSource.readFully(diskRanges); - // transform streams to OrcInputStream - String sourceName = orcDataSource.toString(); - ImmutableMap.Builder streamsBuilder = ImmutableMap.builder(); - for (Entry entry : streamsData.entrySet()) { - streamsBuilder.put(entry.getKey(), new OrcInputStream(sourceName, entry.getValue(), compressionKind, bufferSize)); - } - return streamsBuilder.build(); + return ImmutableMap.copyOf(Maps.transformValues(streamsData, new Function() + { + @Override + public OrcInputStream apply(Slice input) + { + return new OrcInputStream(orcDataSource.toString(), input.getInput(), compressionKind, bufferSize); + } + })); } private Map> createValueStreams(Map streams, Map streamsData, List columnEncodings) @@ -253,7 +204,6 @@ private List createRowGroups( Map> columnIndexes, Set selectedRowGroups, List encodings) - throws InvalidCheckpointException { ImmutableList.Builder rowGroupBuilder = ImmutableList.builder(); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java index 
f5396dfee1..4fd403e643 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java @@ -13,9 +13,9 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; import static com.google.common.base.Preconditions.checkNotNull; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java index 0be5955fda..a76d5c286e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java @@ -13,9 +13,9 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java index 9a12b14d9d..c7a93ea169 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java @@ -13,13 +13,13 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; -import static com.google.common.base.Preconditions.checkNotNull; public final class ByteStreamCheckpoint implements StreamCheckpoint diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java index 847d950e35..f346235d94 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java @@ -13,22 +13,23 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.collect.*; import org.apache.tajo.storage.thirdparty.orc.StreamId; import org.apache.tajo.storage.thirdparty.orc.metadata.*; import 
org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; -import com.google.common.collect.*; import java.util.List; import java.util.Map; import java.util.Set; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Predicates.equalTo; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static com.google.common.base.Preconditions.checkNotNull; -import static com.google.common.base.Predicates.equalTo; public final class Checkpoints { @@ -44,7 +45,6 @@ public static Map getStreamCheckpoints( List columnEncodings, Map streams, Map> columnIndexes) - throws InvalidCheckpointException { ImmutableSetMultimap.Builder streamKindsBuilder = ImmutableSetMultimap.builder(); for (Stream stream : streams.values()) { @@ -105,13 +105,12 @@ public static Map getStreamCheckpoints( // it will write checkpoints for all streams, but in other cases it will write only the streams that exist. 
// We detect this case by checking that all offsets in the initial position list are zero, and if so, we // clear the extra offsets - if (columnPositionsList.hasNextPosition() && !Iterables.all(positionsList, equalTo(0))) { - throw new InvalidCheckpointException(String.format("Column %s, of type %s, contains %s offset positions, but only %s positions were consumed", - column, - columnType, - positionsList.size(), - columnPositionsList.getIndex())); - } + checkState(!columnPositionsList.hasNextPosition() || Iterables.all(positionsList, equalTo(0)), + "Column %s, of type %s, contains %s offset positions, but only %s positions were consumed", + column, + columnType, + positionsList.size(), + columnPositionsList.getIndex()); } return checkpoints.build(); } @@ -396,11 +395,9 @@ public boolean hasNextPosition() public int nextPosition() { - if (!hasNextPosition()) { - throw new InvalidCheckpointException("Not enough positions for column %s, of type %s, checkpoints", - column, - columnType); - } + checkState(hasNextPosition(), "Not enough positions for column %s, of type %s, checkpoints", + column, + columnType); return positionsList.get(index++); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java index 62bf0413a5..80f03de1d9 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java @@ -13,9 +13,9 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import 
org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java index 4edb28787a..2d92cd3494 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java @@ -13,9 +13,9 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java index b9bf773163..92550a6b91 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java @@ -13,9 +13,9 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.UNCOMPRESSED; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java index 655641f412..bb08edd940 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java @@ -13,9 +13,9 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java index 2b7a56b850..410f181d38 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java @@ -13,13 +13,13 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; -import static com.google.common.base.Preconditions.checkNotNull; public class LongStreamV1Checkpoint implements LongStreamCheckpoint diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java index 680a8982e6..352c4d1bc1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java @@ -13,13 +13,13 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; +import static com.google.common.base.Preconditions.checkNotNull; import static 
org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; -import static com.google.common.base.Preconditions.checkNotNull; public final class LongStreamV2Checkpoint implements LongStreamCheckpoint diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java index 97d28e0e19..88ac0515e5 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java @@ -13,9 +13,9 @@ */ package org.apache.tajo.storage.thirdparty.orc.checkpoint; +import com.google.common.base.MoreObjects; import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java new file mode 100644 index 0000000000..65182d49bd --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java @@ -0,0 +1,117 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with 
the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class BooleanJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private BooleanStream dataStream; + + public BooleanJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + 
generator.writeBoolean(dataStream.nextBit()); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + return String.valueOf(dataStream.nextBit()); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, BooleanStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java new file mode 100644 index 0000000000..d1008528a1 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java @@ -0,0 +1,118 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.ByteStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class ByteJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private ByteStream dataStream; + + public ByteJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is 
not present"); + + generator.writeNumber(dataStream.next()); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + return String.valueOf(dataStream.next()); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java new file mode 100644 index 0000000000..3243ead772 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java @@ -0,0 +1,123 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class DateJsonReader + implements JsonMapKeyReader +{ + private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1); + + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private LongStream dataStream; + + public DateJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + 
generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + long millis = dataStream.next() * MILLIS_IN_DAY; + generator.writeNumber(millis); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + long millis = dataStream.next() * MILLIS_IN_DAY; + return String.valueOf(millis); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java new file mode 100644 index 0000000000..1adf00aeec --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java @@ -0,0 +1,120 @@ +/* + * Licensed under the Apache 
License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.DoubleStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class DoubleJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private DoubleStream dataStream; + + public DoubleJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { 
+ generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + double value = dataStream.next(); + generator.writeNumber(value); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + double value = dataStream.next(); + return String.valueOf(value); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, DoubleStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java new file mode 100644 index 0000000000..0b4f668dff --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java @@ -0,0 +1,122 @@ +/* + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.FloatStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class FloatJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private FloatStream dataStream; + + public FloatJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + 
return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // write value as a double to avoid strange rounding errors + double value = dataStream.next(); + generator.writeNumber(value); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // write value as a double to avoid strange rounding errors + double value = dataStream.next(); + return String.valueOf(value); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, FloatStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InvalidCheckpointException.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java similarity index 65% rename from 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InvalidCheckpointException.java rename to tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java index e8438369a0..6e93f8abb2 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InvalidCheckpointException.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java @@ -11,15 +11,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; +package org.apache.tajo.storage.thirdparty.orc.json; -import static java.lang.String.format; +import java.io.IOException; -public class InvalidCheckpointException - extends RuntimeException +public interface JsonMapKeyReader + extends JsonReader { - public InvalidCheckpointException(String message, Object... arguments) - { - super(format(message, arguments)); - } + String nextValueAsMapKey() + throws IOException; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java new file mode 100644 index 0000000000..f35cbe6d82 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +public interface JsonReader +{ + void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException; + + void openRowGroup(StreamSources dataStreamSources) + throws IOException; + + void readNextValueInto(JsonGenerator generator) + throws IOException; + + void skip(int skipSize) + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java new file mode 100644 index 0000000000..06019757d2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java @@ -0,0 +1,100 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.joda.time.DateTimeZone; + +public final class JsonReaders +{ + private JsonReaders() + { + } + + public static JsonMapKeyReader createJsonMapKeyReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + switch (streamDescriptor.getStreamType()) { + case BOOLEAN: + return new BooleanJsonReader(streamDescriptor); + case BYTE: + return new ByteJsonReader(streamDescriptor); + case SHORT: + case INT: + case LONG: + return new LongJsonReader(streamDescriptor); + case FLOAT: + return new FloatJsonReader(streamDescriptor); + case DOUBLE: + return new DoubleJsonReader(streamDescriptor); + case BINARY: + return new SliceJsonReader(streamDescriptor, true); + case STRING: + return new SliceJsonReader(streamDescriptor, false); + case TIMESTAMP: + return new TimestampJsonReader(streamDescriptor, hiveStorageTimeZone); + case DATE: + return new DateJsonReader(streamDescriptor); + case STRUCT: + case LIST: + case MAP: + case UNION: + case DECIMAL: + case VARCHAR: + case CHAR: + default: + throw new IllegalArgumentException("Unsupported map key type: " + streamDescriptor.getStreamType()); + } + } + + public static JsonReader createJsonReader( + StreamDescriptor streamDescriptor, + boolean checkForNulls, + DateTimeZone hiveStorageTimeZone) + { + switch (streamDescriptor.getStreamType()) { + case BOOLEAN: + return new BooleanJsonReader(streamDescriptor); + case BYTE: + return new ByteJsonReader(streamDescriptor); + case SHORT: + case INT: + case LONG: + return new LongJsonReader(streamDescriptor); + case FLOAT: + return new FloatJsonReader(streamDescriptor); + case DOUBLE: + return new DoubleJsonReader(streamDescriptor); + case BINARY: + return new SliceJsonReader(streamDescriptor, true); + case STRING: + return new 
SliceJsonReader(streamDescriptor, false); + case TIMESTAMP: + return new TimestampJsonReader(streamDescriptor, hiveStorageTimeZone); + case DATE: + return new DateJsonReader(streamDescriptor); + case STRUCT: + return new StructJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); + case LIST: + return new ListJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); + case MAP: + return new MapJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); + case UNION: + case DECIMAL: + case VARCHAR: + case CHAR: + default: + throw new IllegalArgumentException("Unsupported type: " + streamDescriptor.getStreamType()); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java new file mode 100644 index 0000000000..d6302fb8b5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java @@ -0,0 +1,125 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.LENGTH; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class ListJsonReader + implements JsonReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean checkForNulls; + + private final JsonReader elementReader; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private LongStream lengthStream; + + public ListJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.checkForNulls = checkForNulls; + + elementReader = createJsonReader(streamDescriptor.getNestedStreams().get(0), true, hiveStorageTimeZone); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(lengthStream != null, "Value is 
not null but length stream is not present"); + + long length = lengthStream.next(); + generator.writeStartArray(); + for (int i = 0; i < length; i++) { + elementReader.readNextValueInto(generator); + } + generator.writeEndArray(); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + long elementSkipSize = lengthStream.sum(skipSize); + elementReader.skip(Ints.checkedCast(elementSkipSize)); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + lengthStream = null; + + elementReader.openStripe(dictionaryStreamSources, encoding); + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + if (checkForNulls) { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + } + lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); + + elementReader.openRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java new file mode 100644 index 0000000000..b26fc9ab5b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java @@ -0,0 +1,142 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; + +public class LongDictionaryJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + @Nullable + private BooleanStream inDictionaryStream; + @Nullable + private LongStream dataStream; + + @Nonnull + private long[] dictionary = new long[0]; + + public LongDictionaryJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + 
generator.writeNull(); + return; + } + + generator.writeNumber(nextValue()); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + return String.valueOf(nextValue()); + } + + private long nextValue() + throws IOException + { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + long value = dataStream.next(); + if (inDictionaryStream == null || inDictionaryStream.nextBit()) { + value = dictionary[((int) value)]; + } + return value; + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + // skip non-null values + if (inDictionaryStream != null) { + inDictionaryStream.skip(skipSize); + } + if (skipSize > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(skipSize); + } + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + int dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); + if (dictionarySize > 0) { + if (dictionary.length < dictionarySize) { + dictionary = new long[dictionarySize]; + } + + LongStream dictionaryStream = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, LongStream.class).openStream(); + verifyFormat(dictionaryStream != null, "Dictionary is not empty but data stream is not present"); + dictionaryStream.nextLongVector(dictionarySize, dictionary); + } + + presentStream = null; + inDictionaryStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + inDictionaryStream = 
dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java new file mode 100644 index 0000000000..b6edb82db2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java @@ -0,0 +1,112 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class LongDirectJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + @Nullable + private LongStream dataStream; + + public LongDirectJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + generator.writeNumber(dataStream.next()); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + return String.valueOf(dataStream.next()); + } + + @Override + public void skip(int skipSize) + throws 
IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + // skip non-null values + if (skipSize > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(skipSize); + } + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java new file mode 100644 index 0000000000..4793a11280 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java @@ -0,0 +1,99 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; + +public class LongJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + private final LongDirectJsonReader directReader; + + private final LongDictionaryJsonReader dictionaryReader; + private JsonMapKeyReader currentReader; + + public LongJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + directReader = new LongDirectJsonReader(streamDescriptor); + dictionaryReader = new LongDictionaryJsonReader(streamDescriptor); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + currentReader.readNextValueInto(generator); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + return currentReader.nextValueAsMapKey(); + } + + @Override + public void skip(int skipSize) + throws IOException + { + currentReader.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); + if (kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT) { + currentReader = directReader; + } + else if (kind == DICTIONARY || kind == DICTIONARY_V2) { + 
currentReader = dictionaryReader; + } + else { + throw new IllegalArgumentException("Unsupported encoding " + kind); + } + + currentReader.openStripe(dictionaryStreamSources, encoding); + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + currentReader.openRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java new file mode 100644 index 0000000000..5b6b73b055 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java @@ -0,0 +1,138 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonMapKeyReader; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.LENGTH; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class MapJsonReader + implements JsonReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean checkForNulls; + + private final JsonMapKeyReader keyReader; + private final JsonReader valueReader; + + @Nullable + private BooleanStream presentStream; + @Nullable + private LongStream lengthStream; + + public MapJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.checkForNulls = checkForNulls; + + keyReader = createJsonMapKeyReader(streamDescriptor.getNestedStreams().get(0), hiveStorageTimeZone); + valueReader = createJsonReader(streamDescriptor.getNestedStreams().get(1), true, hiveStorageTimeZone); + } + + 
@Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + long length = lengthStream.next(); + generator.writeStartObject(); + for (int i = 0; i < length; i++) { + String name = keyReader.nextValueAsMapKey(); + if (name == null) { + valueReader.skip(1); + } + else { + generator.writeFieldName(name); + valueReader.readNextValueInto(generator); + } + } + generator.writeEndObject(); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + // skip non-null values + long elementSkipSize = lengthStream.sum(skipSize); + keyReader.skip(Ints.checkedCast(elementSkipSize)); + valueReader.skip(Ints.checkedCast(elementSkipSize)); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + lengthStream = null; + + keyReader.openStripe(dictionaryStreamSources, encoding); + valueReader.openStripe(dictionaryStreamSources, encoding); + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + if (checkForNulls) { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + } + lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); + + keyReader.openRowGroup(dataStreamSources); + valueReader.openRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java new file mode 100644 index 0000000000..bf7cb6fc13 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java @@ -0,0 +1,269 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.io.BaseEncoding; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.*; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static java.nio.charset.StandardCharsets.UTF_8; + +public class SliceDictionaryJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean writeBinary; + + @Nonnull + private DictionaryEntry[] dictionary = new DictionaryEntry[0]; + + @Nonnull + private int[] dictionaryLength = new int[0]; + + @Nonnull + private DictionaryEntry[] rowGroupDictionary = new DictionaryEntry[0]; + + @Nonnull + private int[] rowGroupDictionaryLength = new int[0]; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private BooleanStream inDictionaryStream; + + @Nullable + private LongStream dataStream; + + public SliceDictionaryJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.writeBinary = writeBinary; + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + DictionaryEntry value = getNextValue(); + + byte[] data = value.getData(); + int offset = 
value.getOffset(); + int length = value.length(); + if (writeBinary) { + generator.writeBinary(data, offset, length); + } + else { + generator.writeUTF8String(data, offset, length); + } + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + DictionaryEntry value = getNextValue(); + + byte[] data = value.getData(); + int offset = value.getOffset(); + int length = value.length(); + if (writeBinary) { + return BaseEncoding.base64().encode(data, offset, length); + } + else { + return new String(data, offset, length, UTF_8); + } + } + + private DictionaryEntry getNextValue() + throws IOException + { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + int dictionaryIndex = Ints.checkedCast(dataStream.next()); + + DictionaryEntry value; + if (inDictionaryStream == null || inDictionaryStream.nextBit()) { + value = dictionary[dictionaryIndex]; + } + else { + value = rowGroupDictionary[dictionaryIndex]; + } + return value; + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null length + if (inDictionaryStream != null) { + inDictionaryStream.skip(skipSize); + } + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + int dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); + if (dictionarySize > 0) { + // resize the dictionary array if necessary + if (dictionary.length < dictionarySize) { + dictionary = new DictionaryEntry[dictionarySize]; + dictionaryLength = new int[dictionarySize]; + } + + LongStream lengthStream = 
dictionaryStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); + verifyFormat(lengthStream != null, "Dictionary is not empty but length stream is not present"); + lengthStream.nextIntVector(dictionarySize, dictionaryLength); + + ByteArrayStream dictionaryDataStream = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, ByteArrayStream.class).openStream(); + readDictionary(dictionaryDataStream, dictionarySize, dictionaryLength, dictionary); + } + + presentStream = null; + dataStream = null; + inDictionaryStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + RowGroupDictionaryLengthStream lengthStream = dataStreamSources.getStreamSource( + streamDescriptor, + ROW_GROUP_DICTIONARY_LENGTH, + RowGroupDictionaryLengthStream.class).openStream(); + + if (lengthStream == null) { + inDictionaryStream = null; + } + else { + inDictionaryStream = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class).openStream(); + + int dictionaryEntryCount = lengthStream.getEntryCount(); + + // resize the dictionary array if necessary + if (rowGroupDictionary.length < dictionaryEntryCount) { + rowGroupDictionary = new DictionaryEntry[dictionaryEntryCount]; + rowGroupDictionaryLength = new int[dictionaryEntryCount]; + } + + // read the lengths + lengthStream.nextIntVector(dictionaryEntryCount, rowGroupDictionaryLength); + + ByteArrayStream dictionaryDataStream = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY, ByteArrayStream.class).openStream(); + readDictionary(dictionaryDataStream, dictionaryEntryCount, rowGroupDictionaryLength, rowGroupDictionary); + } + + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + } + + private static void 
readDictionary(ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, DictionaryEntry[] dictionary) + throws IOException + { + // sum lengths + int totalLength = 0; + for (int i = 0; i < dictionarySize; i++) { + totalLength += dictionaryLength[i]; + } + + // read dictionary data + byte[] dictionaryData = new byte[0]; + if (totalLength > 0) { + verifyFormat(dictionaryDataStream != null, "Dictionary length is not zero but dictionary data stream is not present"); + dictionaryData = dictionaryDataStream.next(totalLength); + } + + // build dictionary slices + int offset = 0; + for (int i = 0; i < dictionarySize; i++) { + int length = dictionaryLength[i]; + dictionary[i] = new DictionaryEntry(dictionaryData, offset, length); + offset += length; + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } + + private static class DictionaryEntry + { + private final byte[] dictionary; + private final int offset; + private final int length; + + public DictionaryEntry(byte[] dictionary, int offset, int length) + { + this.dictionary = dictionary; + this.offset = offset; + this.length = length; + } + + public int length() + { + return length; + } + + public byte[] getData() + { + return dictionary; + } + + public int getOffset() + { + return offset; + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java new file mode 100644 index 0000000000..6f6630c59e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java @@ -0,0 +1,168 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.io.BaseEncoding; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.ByteArrayStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static java.nio.charset.StandardCharsets.UTF_8; + +public class SliceDirectJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean writeBinary; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private LongStream lengthStream; + + @Nullable + private ByteArrayStream dataStream; + + @Nonnull + private byte[] data = new byte[1024]; + + public SliceDirectJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) + { + 
this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.writeBinary = writeBinary; + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + int length = bufferNextValue(); + + if (writeBinary) { + generator.writeBinary(data, 0, length); + } + else { + generator.writeUTF8String(data, 0, length); + } + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + int length = bufferNextValue(); + + if (writeBinary) { + return BaseEncoding.base64().encode(data, 0, length); + } + else { + return new String(data, 0, length, UTF_8); + } + } + + private int bufferNextValue() + throws IOException + { + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + int length = Ints.checkedCast(lengthStream.next()); + if (data.length < length) { + data = new byte[length]; + } + + if (length > 0) { + verifyFormat(dataStream != null, "Length is not zero but data stream is not present"); + dataStream.next(length, data); + } + return length; + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + // skip non-null length + long dataSkipSize = lengthStream.sum(skipSize); + + if (dataSkipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Length is not zero but data stream is not present"); + + // skip data bytes + dataStream.skip(Ints.checkedCast(dataSkipSize)); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + lengthStream 
= null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteArrayStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java new file mode 100644 index 0000000000..68892ca244 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; + +public class SliceJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + private final SliceDirectJsonReader directReader; + private final SliceDictionaryJsonReader dictionaryReader; + private JsonMapKeyReader currentReader; + + public SliceJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + directReader = new SliceDirectJsonReader(streamDescriptor, writeBinary); + dictionaryReader = new SliceDictionaryJsonReader(streamDescriptor, writeBinary); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + currentReader.readNextValueInto(generator); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + return currentReader.nextValueAsMapKey(); + } + + @Override + public void skip(int skipSize) + throws IOException + { + currentReader.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, + List encoding) + throws IOException + { + ColumnEncodingKind columnEncodingKind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); + if (columnEncodingKind == DIRECT || columnEncodingKind == DIRECT_V2 || columnEncodingKind == 
ColumnEncodingKind.DWRF_DIRECT) { + currentReader = directReader; + } + else if (columnEncodingKind == DICTIONARY || columnEncodingKind == DICTIONARY_V2) { + currentReader = dictionaryReader; + } + else { + throw new IllegalArgumentException("Unsupported encoding " + columnEncodingKind); + } + + currentReader.openStripe(dictionaryStreamSources, encoding); + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + currentReader.openRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java new file mode 100644 index 0000000000..600b7b778d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java @@ -0,0 +1,117 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class StructJsonReader + implements JsonReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean checkForNulls; + private final JsonReader[] structFields; + + @Nullable + private BooleanStream presentStream; + + public StructJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.checkForNulls = checkForNulls; + + List nestedStreams = streamDescriptor.getNestedStreams(); + this.structFields = new JsonReader[nestedStreams.size()]; + for (int i = 0; i < nestedStreams.size(); i++) { + StreamDescriptor nestedStream = nestedStreams.get(i); + this.structFields[i] = createJsonReader(nestedStream, true, hiveStorageTimeZone); + } + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + generator.writeStartArray(); + for (JsonReader structField : structFields) { + structField.readNextValueInto(generator); + } + generator.writeEndArray(); + } + + @Override 
+ public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + // skip non-null values + for (JsonReader structField : structFields) { + structField.skip(skipSize); + } + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + + for (JsonReader structField : structFields) { + structField.openStripe(dictionaryStreamSources, encoding); + } + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + if (checkForNulls) { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + } + + for (JsonReader structField : structFields) { + structField.openRowGroup(dataStreamSources); + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java new file mode 100644 index 0000000000..bfebf78658 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java @@ -0,0 +1,134 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.reader.TimestampStreamReader.decodeTimestamp; + +public class TimestampJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + private final long baseTimestampInSeconds; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private LongStream secondsStream; + + @Nullable + private LongStream nanosStream; + + public TimestampJsonReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / 1000; + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + 
verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + long timestamp = decodeTimestamp(secondsStream.next(), nanosStream.next(), baseTimestampInSeconds); + generator.writeNumber(timestamp); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + long timestamp = decodeTimestamp(secondsStream.next(), nanosStream.next(), baseTimestampInSeconds); + return String.valueOf(timestamp); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + // skip non-null values + secondsStream.skip(skipSize); + nanosStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + secondsStream = null; + nanosStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + secondsStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + nanosStream = dataStreamSources.getStreamSource(streamDescriptor, SECONDARY, LongStream.class).openStream(); + } + + @Override + public String toString() + { + return 
toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java index 73edd32004..20ae97058e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java @@ -16,25 +16,21 @@ import com.facebook.hive.orc.OrcProto; import com.facebook.hive.orc.OrcProto.ColumnEncoding.Kind; import com.google.common.base.Function; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.primitives.Ints; import com.google.protobuf.CodedInputStream; -import io.airlift.slice.Slice; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; -import javax.annotation.Nullable; import java.io.IOException; import java.io.InputStream; import java.util.List; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; -import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader.getMaxSlice; -import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader.getMinSlice; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; +import static 
org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; public class DwrfMetadataReader implements MetadataReader @@ -77,13 +73,12 @@ public Footer readFooter(InputStream inputStream) private static List toStripeInformation(List types) { - // Modifying for JDK 1.6 - // return ImmutableList.copyOf(Iterables.transform(types, DwrfMetadataReader::toStripeInformation)); - return ImmutableList.copyOf(Iterables.transform(types, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { @Override - public StripeInformation apply(OrcProto.StripeInformation stripeInformation) { - return toStripeInformation(stripeInformation); + public StripeInformation apply(OrcProto.StripeInformation type) + { + return toStripeInformation(type); } })); } @@ -114,12 +109,11 @@ private static Stream toStream(OrcProto.Stream stream) private static List toStream(List streams) { - // Modifying for JDK 1.6 - // return ImmutableList.copyOf(Iterables.transform(streams, DwrfMetadataReader::toStream)); - return ImmutableList.copyOf(Iterables.transform(streams, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(streams, new Function() + { @Override - public Stream apply(@Nullable OrcProto.Stream stream) { + public Stream apply(OrcProto.Stream stream) + { return toStream(stream); } })); @@ -148,13 +142,11 @@ public List readRowIndexes(InputStream inputStream) { CodedInputStream input = CodedInputStream.newInstance(inputStream); OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); - - // Modifying for JDK 1.6 - // return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), DwrfMetadataReader::toRowGroupIndex)); - return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() + { @Override - public RowGroupIndex apply(@Nullable OrcProto.RowIndexEntry rowIndexEntry) { + 
public RowGroupIndex apply(OrcProto.RowIndexEntry rowIndexEntry) + { return toRowGroupIndex(rowIndexEntry); } })); @@ -180,13 +172,11 @@ private static List toColumnStatistics(List toColumnStatistics(statistics, isRowGroup))); - return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() + { @Override - public ColumnStatistics apply(@Nullable OrcProto.ColumnStatistics columnStatistics) { + public ColumnStatistics apply(OrcProto.ColumnStatistics columnStatistics) + { return toColumnStatistics(columnStatistics, isRowGroup); } })); @@ -229,10 +219,10 @@ private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics dou return null; } - // if either min, max, or sum is NaN, ignore the stat + // TODO remove this when double statistics are changed to correctly deal with NaNs + // if either min or max is NaN, ignore the stat if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || - (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum())) || - (doubleStatistics.hasSum() && Double.isNaN(doubleStatistics.getSum()))) { + (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum()))) { return null; } @@ -252,10 +242,15 @@ private static StringStatistics toStringStatistics(OrcProto.StringStatistics str return null; } - Slice minimum = stringStatistics.hasMinimum() ? getMinSlice(stringStatistics.getMinimum()) : null; - Slice maximum = stringStatistics.hasMaximum() ? getMaxSlice(stringStatistics.getMaximum()) : null; + // temporarily disable string statistics until we figure out the implications of how UTF-16 + // strings are compared when they contain surrogate pairs and replacement characters + if (true) { + return null; + } - return new StringStatistics(minimum, maximum); + return new StringStatistics( + stringStatistics.hasMinimum() ? 
stringStatistics.getMinimum() : null, + stringStatistics.hasMaximum() ? stringStatistics.getMaximum() : null); } private static OrcType toType(OrcProto.Type type) @@ -265,12 +260,11 @@ private static OrcType toType(OrcProto.Type type) private static List toType(List types) { - // Modifying for JDK 1.6 - // return ImmutableList.copyOf(Iterables.transform(types, DwrfMetadataReader::toType)); - return ImmutableList.copyOf(Iterables.transform(types, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { @Override - public OrcType apply(@Nullable OrcProto.Type type) { + public OrcType apply(OrcProto.Type type) + { return toType(type); } })); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java index e83221775f..38bae8b8f2 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java @@ -13,34 +13,27 @@ */ package org.apache.tajo.storage.thirdparty.orc.metadata; -import com.facebook.presto.hive.protobuf.CodedInputStream; +import com.facebook.presto.hive.shaded.com.google.protobuf.CodedInputStream; import com.google.common.base.Function; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; -import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.google.common.collect.Iterables; import com.google.common.primitives.Ints; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; +import 
org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; import org.apache.hadoop.hive.ql.io.orc.OrcProto; import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; -import javax.annotation.Nullable; import java.io.IOException; import java.io.InputStream; import java.util.List; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; import static com.google.common.base.Preconditions.checkState; -import static java.lang.Character.MIN_SURROGATE; +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; public class OrcMetadataReader implements MetadataReader { - private static final Slice MAX_BYTE = Slices.wrappedBuffer(new byte[] { (byte) 0xFF }); - @Override public PostScript readPostScript(byte[] data, int offset, int length) throws IOException @@ -67,13 +60,12 @@ public Metadata readMetadata(InputStream inputStream) private static List toStripeStatistics(List types) { - // Modifying for JDK 1.6 - // return ImmutableList.copyOf(Iterables.transform(types, OrcMetadataReader::toStripeStatistics)); - return ImmutableList.copyOf(Iterables.transform(types, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { @Override - public StripeStatistics apply(@Nullable OrcProto.StripeStatistics stripeStatistics) { - return new StripeStatistics(toColumnStatistics(stripeStatistics.getColStatsList(), false)); + public StripeStatistics apply(OrcProto.StripeStatistics type) + { + return toStripeStatistics(type); } })); } @@ -99,12 +91,12 @@ public Footer readFooter(InputStream inputStream) private static List toStripeInformation(List types) { - // Modifying for JDK 1.6 - return ImmutableList.copyOf(Iterables.transform(types, new Function() { - @Nullable + return 
ImmutableList.copyOf(Iterables.transform(types, new Function() + { @Override - public StripeInformation apply(@Nullable OrcProto.StripeInformation stripeInformation) { - return toStripeInformation(stripeInformation); + public StripeInformation apply(OrcProto.StripeInformation type) + { + return toStripeInformation(type); } })); } @@ -135,12 +127,11 @@ private static Stream toStream(OrcProto.Stream stream) private static List toStream(List streams) { - // Modifying for JDK 1.6 - //return ImmutableList.copyOf(Iterables.transform(streams, OrcMetadataReader::toStream)); - return ImmutableList.copyOf(Iterables.transform(streams, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(streams, new Function() + { @Override - public Stream apply(@Nullable OrcProto.Stream stream) { + public Stream apply(OrcProto.Stream stream) + { return toStream(stream); } })); @@ -153,12 +144,11 @@ private static ColumnEncoding toColumnEncoding(OrcProto.ColumnEncoding columnEnc private static List toColumnEncoding(List columnEncodings) { - // Modifying for JDK 1.6 - // return ImmutableList.copyOf(Iterables.transform(columnEncodings, OrcMetadataReader::toColumnEncoding)); - return ImmutableList.copyOf(Iterables.transform(columnEncodings, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(columnEncodings, new Function() + { @Override - public ColumnEncoding apply(@Nullable OrcProto.ColumnEncoding columnEncoding) { + public ColumnEncoding apply(OrcProto.ColumnEncoding columnEncoding) + { return toColumnEncoding(columnEncoding); } })); @@ -170,13 +160,11 @@ public List readRowIndexes(InputStream inputStream) { CodedInputStream input = CodedInputStream.newInstance(inputStream); OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); - - // Modifying for JDK 1.6 - //return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), OrcMetadataReader::toRowGroupIndex)); - return 
ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() + { @Override - public RowGroupIndex apply(@Nullable RowIndexEntry rowIndexEntry) { + public RowGroupIndex apply(RowIndexEntry rowIndexEntry) + { return toRowGroupIndex(rowIndexEntry); } })); @@ -213,13 +201,11 @@ private static List toColumnStatistics(List toColumnStatistics(statistics, isRowGroup))); - return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() + { @Override - public ColumnStatistics apply(@Nullable OrcProto.ColumnStatistics columnStatistics) { + public ColumnStatistics apply(OrcProto.ColumnStatistics columnStatistics) + { return toColumnStatistics(columnStatistics, isRowGroup); } })); @@ -252,10 +238,9 @@ private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics dou } // TODO remove this when double statistics are changed to correctly deal with NaNs - // if either min, max, or sum is NaN, ignore the stat + // if either min or max is NaN, ignore the stat if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || - (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum())) || - (doubleStatistics.hasSum() && Double.isNaN(doubleStatistics.getSum()))) { + (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum()))) { return null; } @@ -275,84 +260,9 @@ private static StringStatistics toStringStatistics(OrcProto.StringStatistics str return null; } - /* - The writer performs comparisons using java Strings to determine the minimum and maximum - values. This results in weird behaviors in the presence of surrogate pairs and special characters. 
- - For example, unicode codepoint 0x1D403 has the following representations: - UTF-16: [0xD835, 0xDC03] - UTF-8: [0xF0, 0x9D, 0x90, 0x83] - - while codepoint 0xFFFD (the replacement character) has the following representations: - UTF-16: [0xFFFD] - UTF-8: [0xEF, 0xBF, 0xBD] - - when comparisons between strings containing these characters are done with Java Strings (UTF-16), - 0x1D403 < 0xFFFD, but when comparisons are done using raw codepoints or UTF-8, 0x1D403 > 0xFFFD - - We use the following logic to ensure that we have a wider range of min-max - * if a min string has a surrogate character, the min string is truncated - at the first occurrence of the surrogate character (to exclude the surrogate character) - * if a max string has a surrogate character, the max string is truncated - at the first occurrence the surrogate character and 0xFF byte is appended to it. - - */ - Slice minimum = stringStatistics.hasMinimum() ? getMinSlice(stringStatistics.getMinimum()) : null; - Slice maximum = stringStatistics.hasMaximum() ? 
getMaxSlice(stringStatistics.getMaximum()) : null; - - return new StringStatistics(minimum, maximum); - } - - @VisibleForTesting - public static Slice getMaxSlice(String maximum) - { - if (maximum == null) { - return null; - } - - int index = firstSurrogateCharacter(maximum); - if (index == -1) { - return Slices.utf8Slice(maximum); - } - // Append 0xFF so that it is larger than maximum - return concatSlices(Slices.utf8Slice(maximum.substring(0, index)), MAX_BYTE); - } - - @VisibleForTesting - public static Slice getMinSlice(String minimum) - { - if (minimum == null) { - return null; - } - - int index = firstSurrogateCharacter(minimum); - if (index == -1) { - return Slices.utf8Slice(minimum); - } - // truncate the string at the first surrogate character - return Slices.utf8Slice(minimum.substring(0, index)); - } - - // returns index of first surrogateCharacter in the string -1 if no surrogate character is found - @VisibleForTesting - static int firstSurrogateCharacter(String value) - { - char[] chars = value.toCharArray(); - for (int i = 0; i < chars.length; i++) { - if (chars[i] >= MIN_SURROGATE) { - return i; - } - } - return -1; - } - - @VisibleForTesting - static Slice concatSlices(Slice slice1, Slice slice2) - { - Slice slice = Slices.allocate(slice1.length() + slice2.length()); - slice.setBytes(0, slice1.getBytes()); - slice.setBytes(slice1.length(), slice2.getBytes()); - return slice; + return new StringStatistics( + stringStatistics.hasMinimum() ? stringStatistics.getMinimum() : null, + stringStatistics.hasMaximum() ? 
stringStatistics.getMaximum() : null); } private static DateStatistics toDateStatistics(OrcProto.DateStatistics dateStatistics, boolean isRowGroup) @@ -366,6 +276,12 @@ private static DateStatistics toDateStatistics(OrcProto.DateStatistics dateStati return null; } + // temporarily disable string statistics until we figure out the implications of how UTF-16 + // strings are compared when they contain surrogate pairs and replacement characters + if (true) { + return null; + } + return new DateStatistics( dateStatistics.hasMinimum() ? dateStatistics.getMinimum() : null, dateStatistics.hasMaximum() ? dateStatistics.getMaximum() : null); @@ -378,12 +294,11 @@ private static OrcType toType(OrcProto.Type type) private static List toType(List types) { - // Modifying for JDK 1.6 - // return ImmutableList.copyOf(Iterables.transform(types, OrcMetadataReader::toType)); - return ImmutableList.copyOf(Iterables.transform(types, new Function() { - @Nullable + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { @Override - public OrcType apply(@Nullable OrcProto.Type type) { + public OrcType apply(OrcProto.Type type) + { return toType(type); } })); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java index a61849b8d7..17cb8ba289 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java @@ -13,28 +13,26 @@ */ package org.apache.tajo.storage.thirdparty.orc.metadata; -import io.airlift.slice.Slice; - public class StringStatistics - implements RangeStatistics + implements RangeStatistics { - private final Slice minimum; - private final Slice maximum; + private final String minimum; + 
private final String maximum; - public StringStatistics(Slice minimum, Slice maximum) + public StringStatistics(String minimum, String maximum) { this.minimum = minimum; this.maximum = maximum; } @Override - public Slice getMin() + public String getMin() { return minimum; } @Override - public Slice getMax() + public String getMax() { return maximum; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java index 6ea9dc1e2e..cb38b2ed6e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java @@ -14,7 +14,6 @@ package org.apache.tajo.storage.thirdparty.orc.reader; import org.apache.tajo.storage.thirdparty.orc.BooleanVector; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; @@ -27,12 +26,12 @@ import java.util.Arrays; import java.util.List; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; -import static 
com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; public class BooleanStreamReader implements StreamReader @@ -81,27 +80,21 @@ public void readBatch(Object vector) readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.skip(readOffset); } } - BooleanVector booleanVector = castOrcVector(vector, BooleanVector.class); + BooleanVector booleanVector = (BooleanVector) vector; if (presentStream == null) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); Arrays.fill(booleanVector.isNull, false); dataStream.getSetBits(nextBatchSize, booleanVector.vector); } else { int nullValues = presentStream.getUnsetBits(nextBatchSize, booleanVector.isNull); if (nullValues != nextBatchSize) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.getSetBits(nextBatchSize, booleanVector.vector, booleanVector.isNull); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java index 14e1d6de9e..3688d2fce2 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java @@ -14,7 +14,6 @@ package 
org.apache.tajo.storage.thirdparty.orc.reader; import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; @@ -28,12 +27,12 @@ import java.util.Arrays; import java.util.List; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; public class ByteStreamReader implements StreamReader @@ -82,27 +81,21 @@ public void readBatch(Object vector) readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.skip(readOffset); } } - LongVector byteVector = castOrcVector(vector, LongVector.class); + LongVector byteVector = (LongVector) vector; if (presentStream == null) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); Arrays.fill(byteVector.isNull, false); 
dataStream.nextVector(nextBatchSize, byteVector.vector); } else { int nullValues = presentStream.getUnsetBits(nextBatchSize, byteVector.isNull); if (nullValues != nextBatchSize) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.nextVector(nextBatchSize, byteVector.vector, byteVector.isNull); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java index db05a04da0..afca11996d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java @@ -14,7 +14,6 @@ package org.apache.tajo.storage.thirdparty.orc.reader; import org.apache.tajo.storage.thirdparty.orc.DoubleVector; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; @@ -28,12 +27,12 @@ import java.util.Arrays; import java.util.List; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; import static 
org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; public class DoubleStreamReader implements StreamReader @@ -82,27 +81,21 @@ public void readBatch(Object vector) readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.skip(readOffset); } } - DoubleVector doubleVector = castOrcVector(vector, DoubleVector.class); + DoubleVector doubleVector = (DoubleVector) vector; if (presentStream == null) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); Arrays.fill(doubleVector.isNull, false); dataStream.nextVector(nextBatchSize, doubleVector.vector); } else { int nullValues = presentStream.getUnsetBits(nextBatchSize, doubleVector.isNull); if (nullValues != nextBatchSize) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.nextVector(nextBatchSize, doubleVector.vector, doubleVector.isNull); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java index f9c55fd78f..8d75390337 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java @@ -14,7 +14,6 @@ package org.apache.tajo.storage.thirdparty.orc.reader; import org.apache.tajo.storage.thirdparty.orc.DoubleVector; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; @@ -28,12 +27,12 @@ import java.util.Arrays; import java.util.List; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; public class FloatStreamReader implements StreamReader @@ -82,28 +81,22 @@ public void readBatch(Object vector) readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.skip(readOffset); } } // we could add a float vector but Presto currently doesn't support floats - DoubleVector floatVector = castOrcVector(vector, DoubleVector.class); + DoubleVector floatVector = (DoubleVector) vector; if (presentStream == null) { - if (dataStream == null) { 
- throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); Arrays.fill(floatVector.isNull, false); dataStream.nextVector(nextBatchSize, floatVector.vector); } else { int nullValues = presentStream.getUnsetBits(nextBatchSize, floatVector.isNull); if (nullValues != nextBatchSize) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.nextVector(nextBatchSize, floatVector.vector, floatVector.isNull); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java new file mode 100644 index 0000000000..8048e61335 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java @@ -0,0 +1,180 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.SliceVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.json.JsonReader; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import io.airlift.slice.DynamicSliceOutput; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class JsonStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + private final JsonReader jsonReader; + + private boolean stripeOpen; + private boolean rowGroupOpen; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; + + private int readOffset; + private int nextBatchSize; + + @Nullable + private StreamSources dictionaryStreamSources; + @Nullable + private StreamSources dataStreamSources; + + private List encoding; + + public 
JsonStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.jsonReader = createJsonReader(streamDescriptor, false, hiveStorageTimeZone); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the length reader + readOffset = presentStream.countBitsSet(readOffset); + } + + jsonReader.skip(readOffset); + } + + SliceVector sliceVector = (SliceVector) vector; + if (presentStream != null) { + presentStream.getUnsetBits(nextBatchSize, isNullVector); + } + + DynamicSliceOutput out = new DynamicSliceOutput(1024); + for (int i = 0; i < nextBatchSize; i++) { + if (!isNullVector[i]) { + out.reset(); + JsonGenerator generator = new JsonFactory().createGenerator(out); + jsonReader.readNextValueInto(generator); + sliceVector.vector[i] = out.copySlice(); + } + else { + sliceVector.vector[i] = null; + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + + if (!stripeOpen) { + jsonReader.openStripe(dictionaryStreamSources, encoding); + } + + jsonReader.openRowGroup(dataStreamSources); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + this.dictionaryStreamSources = dictionaryStreamSources; + this.dataStreamSources = null; + this.encoding = encoding; + + presentStreamSource = missingStreamSource(BooleanStream.class); + + stripeOpen = false; + rowGroupOpen = false; + + readOffset = 0; + nextBatchSize = 0; + + 
Arrays.fill(isNullVector, false); + + presentStream = null; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + this.dataStreamSources = dataStreamSources; + + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + + rowGroupOpen = false; + + readOffset = 0; + nextBatchSize = 0; + + Arrays.fill(isNullVector, false); + + presentStream = null; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java index acb15ba5cb..bd847f6efd 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java @@ -14,7 +14,6 @@ package org.apache.tajo.storage.thirdparty.orc.reader; import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.Vector; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; @@ -29,11 +28,11 @@ import java.util.Arrays; import java.util.List; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; import static com.google.common.base.MoreObjects.toStringHelper; import static 
com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; public class LongDictionaryStreamReader implements StreamReader @@ -100,28 +99,22 @@ public void readBatch(Object vector) } if (readOffset > 0) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.skip(readOffset); } } - LongVector longVector = castOrcVector(vector, LongVector.class); + LongVector longVector = (LongVector) vector; if (presentStream == null) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); Arrays.fill(longVector.isNull, false); dataStream.nextLongVector(nextBatchSize, longVector.vector); } else { int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); if (nullValues != nextBatchSize) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); } } @@ -155,9 +148,7 @@ private void openRowGroup() } LongStream dictionaryStream = dictionaryDataStreamSource.openStream(); - if (dictionaryStream == null) { - throw new OrcCorruptionException("Dictionary is not empty but data stream is not present"); - } + verifyFormat(dictionaryStream != null, "Dictionary is not empty but data stream is not present"); dictionaryStream.nextLongVector(dictionarySize, dictionary); } 
dictionaryOpen = true; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java index f10c6e0d4f..b50201cc0e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java @@ -14,7 +14,6 @@ package org.apache.tajo.storage.thirdparty.orc.reader; import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; @@ -28,12 +27,12 @@ import java.util.Arrays; import java.util.List; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; public class LongDirectStreamReader implements StreamReader @@ -82,27 +81,21 @@ public void readBatch(Object vector) readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { - if (dataStream == null) { - throw new 
OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.skip(readOffset); } } - LongVector longVector = castOrcVector(vector, LongVector.class); + LongVector longVector = (LongVector) vector; if (presentStream == null) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); Arrays.fill(longVector.isNull, false); dataStream.nextLongVector(nextBatchSize, longVector.vector); } else { int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); if (nullValues != nextBatchSize) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java index 8fc4a4d0dd..6943049acd 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java @@ -21,9 +21,9 @@ import java.io.IOException; import java.util.List; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; public class 
LongStreamReader implements StreamReader diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/OrcReaderUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/OrcReaderUtils.java deleted file mode 100644 index 84348d18c1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/OrcReaderUtils.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; - -final class OrcReaderUtils -{ - private OrcReaderUtils() - { - } - - public static T castOrcVector(Object vector, Class type) - throws OrcCorruptionException - { - if (!type.isInstance(vector)) { - throw new OrcCorruptionException("Expected %s, but got %s", type.getSimpleName(), vector.getClass().getName()); - } - return type.cast(vector); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java index 22c6a4d911..bf7f362be5 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java @@ -13,7 +13,6 @@ */ package org.apache.tajo.storage.thirdparty.orc.reader; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.SliceVector; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.Vector; @@ -28,11 +27,11 @@ import java.util.Arrays; import java.util.List; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static 
org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; public class SliceDictionaryStreamReader implements StreamReader @@ -112,9 +111,7 @@ public void readBatch(Object vector) readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); if (inDictionaryStream != null) { inDictionaryStream.skip(readOffset); } @@ -122,21 +119,17 @@ public void readBatch(Object vector) } } - SliceVector sliceVector = castOrcVector(vector, SliceVector.class); + SliceVector sliceVector = (SliceVector) vector; if (presentStream == null) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); Arrays.fill(isNullVector, false); dataStream.nextIntVector(nextBatchSize, dataVector); } else { int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector); if (nullValues != nextBatchSize) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.nextIntVector(nextBatchSize, dataVector, isNullVector); } } @@ -177,9 +170,7 @@ private void openRowGroup() // read the lengths LongStream lengthStream = dictionaryLengthStreamSource.openStream(); - if (lengthStream == null) { - throw new OrcCorruptionException("Dictionary is not empty but dictionary length stream is not present"); - } + verifyFormat(lengthStream != null, "Dictionary is not empty but dictionary length stream is not present"); lengthStream.nextIntVector(dictionarySize, 
dictionaryLength); ByteArrayStream dictionaryDataStream = dictionaryDataStreamSource.openStream(); @@ -216,15 +207,25 @@ private void openRowGroup() private static void readDictionary(@Nullable ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, Slice[] dictionary) throws IOException { + // sum lengths + int totalLength = 0; + for (int i = 0; i < dictionarySize; i++) { + totalLength += dictionaryLength[i]; + } + + // read dictionary data + byte[] dictionaryData = new byte[0]; + if (totalLength > 0) { + verifyFormat(dictionaryDataStream != null, "Dictionary length is not zero but dictionary data stream is not present"); + dictionaryData = dictionaryDataStream.next(totalLength); + } + // build dictionary slices + int offset = 0; for (int i = 0; i < dictionarySize; i++) { int length = dictionaryLength[i]; - if (length == 0) { - dictionary[i] = Slices.EMPTY_SLICE; - } - else { - dictionary[i] = Slices.wrappedBuffer(dictionaryDataStream.next(length)); - } + dictionary[i] = Slices.wrappedBuffer(dictionaryData, offset, length); + offset += length; } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java index f7f098933d..994b25d29a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java @@ -13,13 +13,12 @@ */ package org.apache.tajo.storage.thirdparty.orc.reader; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import com.google.common.primitives.Ints; import org.apache.tajo.storage.thirdparty.orc.SliceVector; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import 
org.apache.tajo.storage.thirdparty.orc.Vector; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; import org.apache.tajo.storage.thirdparty.orc.stream.*; -import com.google.common.primitives.Ints; import io.airlift.slice.Slices; import javax.annotation.Nonnull; @@ -28,17 +27,15 @@ import java.util.Arrays; import java.util.List; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; public class SliceDirectStreamReader implements StreamReader { - private static final byte[] EMPTY_BYTE_ARRAY = new byte[0]; - private final StreamDescriptor streamDescriptor; private int readOffset; @@ -90,32 +87,24 @@ public void readBatch(Object vector) readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { - if (lengthStream == null) { - throw new OrcCorruptionException("Value is not null but length stream is not present"); - } + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); long dataSkipSize = lengthStream.sum(readOffset); if (dataSkipSize > 0) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); dataStream.skip(Ints.checkedCast(dataSkipSize)); } } } - SliceVector sliceVector = castOrcVector(vector, SliceVector.class); + 
SliceVector sliceVector = (SliceVector) vector; if (presentStream == null) { - if (lengthStream == null) { - throw new OrcCorruptionException("Value is not null but length stream is not present"); - } + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); lengthStream.nextIntVector(nextBatchSize, lengthVector); } else { int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector); if (nullValues != nextBatchSize) { - if (lengthStream == null) { - throw new OrcCorruptionException("Value is not null but length stream is not present"); - } + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); lengthStream.nextIntVector(nextBatchSize, lengthVector, isNullVector); } } @@ -127,11 +116,9 @@ public void readBatch(Object vector) } } - byte[] data = EMPTY_BYTE_ARRAY; + byte[] data = new byte[0]; if (totalLength > 0) { - if (dataStream == null) { - throw new OrcCorruptionException("Value is not null but data stream is not present"); - } + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); data = dataStream.next(totalLength); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java index 660410f52f..e046dff632 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java @@ -21,9 +21,9 @@ import java.io.IOException; import java.util.List; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkNotNull; +import static 
org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; public class SliceStreamReader implements StreamReader diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java new file mode 100644 index 0000000000..7d0e8cc9f2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java @@ -0,0 +1,58 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.joda.time.DateTimeZone; + +public final class StreamReaders +{ + private StreamReaders() + { + } + + public static StreamReader createStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + switch (streamDescriptor.getStreamType()) { + case BOOLEAN: + return new BooleanStreamReader(streamDescriptor); + case BYTE: + return new ByteStreamReader(streamDescriptor); + case SHORT: + case INT: + case LONG: + case DATE: + return new LongStreamReader(streamDescriptor); + case FLOAT: + return new FloatStreamReader(streamDescriptor); + case DOUBLE: + return new DoubleStreamReader(streamDescriptor); + case BINARY: + case STRING: + return new SliceStreamReader(streamDescriptor); + case TIMESTAMP: + return new TimestampStreamReader(streamDescriptor, hiveStorageTimeZone); + case STRUCT: + case LIST: + case MAP: + return new JsonStreamReader(streamDescriptor, hiveStorageTimeZone); + case UNION: + case DECIMAL: + case VARCHAR: + case CHAR: + default: + throw new IllegalArgumentException("Unsupported type: " + streamDescriptor.getStreamType()); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java index 07913335ec..ba96f7cdcb 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java @@ -14,7 +14,6 @@ package org.apache.tajo.storage.thirdparty.orc.reader; import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import 
org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.Vector; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; @@ -31,11 +30,11 @@ import java.util.Arrays; import java.util.List; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.reader.OrcReaderUtils.castOrcVector; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; public class TimestampStreamReader implements StreamReader @@ -95,26 +94,18 @@ public void readBatch(Object vector) readOffset = presentStream.countBitsSet(readOffset); } if (readOffset > 0) { - if (secondsStream == null) { - throw new OrcCorruptionException("Value is not null but seconds stream is not present"); - } - if (nanosStream == null) { - throw new OrcCorruptionException("Value is not null but nanos stream is not present"); - } + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); secondsStream.skip(readOffset); nanosStream.skip(readOffset); } } - LongVector longVector = castOrcVector(vector, LongVector.class); + LongVector longVector = (LongVector) vector; if (presentStream == null) { - if (secondsStream == null) { - throw new OrcCorruptionException("Value is not null but seconds stream is not present"); - } - if (nanosStream == null) { - throw new OrcCorruptionException("Value is not null but nanos stream is 
not present"); - } + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); Arrays.fill(longVector.isNull, false); secondsStream.nextLongVector(nextBatchSize, longVector.vector); @@ -123,12 +114,8 @@ public void readBatch(Object vector) else { int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); if (nullValues != nextBatchSize) { - if (secondsStream == null) { - throw new OrcCorruptionException("Value is not null but seconds stream is not present"); - } - if (nanosStream == null) { - throw new OrcCorruptionException("Value is not null but nanos stream is not present"); - } + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); secondsStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); nanosStream.nextLongVector(nextBatchSize, nanosVector, longVector.isNull); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java index 321e0ff4e4..853609af56 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java @@ -17,9 +17,9 @@ import java.io.IOException; +import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; -import static com.google.common.base.Preconditions.checkNotNull; public class ByteArrayStream implements ValueStream diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java index 8469283fb5..adb27cbeb9 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java @@ -13,12 +13,12 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteStreamCheckpoint; import java.io.IOException; import java.util.Arrays; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; @@ -44,9 +44,7 @@ private void readNextBlock() lastReadInputCheckpoint = input.getCheckpoint(); int control = input.read(); - if (control == -1) { - throw new OrcCorruptionException("Read past end of buffer RLE byte from %s", input); - } + verifyFormat(control != -1, "Read past end of buffer RLE byte from %s", input); offset = 0; @@ -56,9 +54,7 @@ private void readNextBlock() // read the repeated value int value = input.read(); - if (value == -1) { - throw new OrcCorruptionException("Reading RLE byte got EOF"); - } + verifyFormat(value != -1, "Reading RLE byte got EOF"); // fill buffer with the value Arrays.fill(buffer, 0, length, (byte) value); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java index 89ee357507..6c3e5ea6c9 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java @@ -13,8 +13,8 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; import javax.annotation.Nullable; import java.io.IOException; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java index 1344bc66c0..08f1f160e2 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java @@ -20,9 +20,9 @@ import java.io.IOException; +import static com.google.common.base.Preconditions.checkPositionIndex; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; -import static com.google.common.base.Preconditions.checkPositionIndex; import static io.airlift.slice.SizeOf.SIZE_OF_DOUBLE; public class DoubleStream diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java index b60bd46103..722c9470fd 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java @@ -20,9 +20,9 @@ import java.io.IOException; +import static com.google.common.base.Preconditions.checkPositionIndex; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; -import static com.google.common.base.Preconditions.checkPositionIndex; import static io.airlift.slice.SizeOf.SIZE_OF_FLOAT; public class FloatStream diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java index 6ad58b8776..40753bfe75 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java @@ -13,14 +13,15 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; import java.io.IOException; import java.io.InputStream; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.*; import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; // This is based on the Apache Hive ORC code public final class LongDecode @@ -124,9 +125,7 @@ public static long readUnsignedVInt(InputStream inputStream) long b; do { b = inputStream.read(); - if (b == -1) { - throw new OrcCorruptionException("EOF while reading unsigned vint"); - } + verifyFormat(b != -1, "EOF while 
reading unsigned vint"); result |= (b & 0x7F /* 0b0111_1111 */) << offset; offset += 7; } while ((b & 0x80 /* 0b1000_0000 */) != 0); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java index 16c9180fd4..e037be6c3e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java @@ -13,16 +13,16 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; +import com.google.common.primitives.Ints; import org.apache.tajo.storage.thirdparty.orc.Vector; import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamDwrfCheckpoint; import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import com.google.common.primitives.Ints; import java.io.IOException; -import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.readDwrfLong; import static com.google.common.base.Preconditions.checkPositionIndex; +import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.readDwrfLong; public class LongStreamDwrf implements LongStream diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java index ee12910031..29a6d25ef6 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java @@ -13,13 +13,13 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; 
-import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import com.google.common.primitives.Ints; import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV1Checkpoint; -import com.google.common.primitives.Ints; import java.io.IOException; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; public class LongStreamV1 @@ -50,18 +50,14 @@ private void readValues() lastReadInputCheckpoint = input.getCheckpoint(); int control = input.read(); - if (control == -1) { - throw new OrcCorruptionException("Read past end of RLE integer from %s", input); - } + verifyFormat(control != -1, "Read past end of RLE integer from %s", input); if (control < 0x80) { numLiterals = control + MIN_REPEAT_SIZE; used = 0; repeat = true; delta = input.read(); - if (delta == -1) { - throw new OrcCorruptionException("End of stream in RLE Integer from %s", input); - } + verifyFormat(delta != -1, "End of stream in RLE Integer from %s", input); // convert from 0 to 255 to -128 to 127 by converting to a signed byte // noinspection SillyAssignment diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java index 87f554787c..f22b3681d2 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java @@ -13,14 +13,14 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import com.google.common.primitives.Ints; import 
org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV2Checkpoint; -import com.google.common.primitives.Ints; import java.io.IOException; import java.io.InputStream; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; /** @@ -61,9 +61,7 @@ private void readValues() // read the first 2 bits and determine the encoding type int firstByte = input.read(); - if (firstByte < 0) { - throw new OrcCorruptionException("Read past end of RLE integer from %s", input); - } + verifyFormat(firstByte >= 0, "Read past end of RLE integer from %s", input); int enc = (firstByte >>> 6) & 0x03; if (EncodingType.SHORT_REPEAT.ordinal() == enc) { @@ -184,9 +182,7 @@ private void readPatchedBaseValues(int firstByte) // unpack the patch blob long[] unpackedPatch = new long[patchListLength]; - if ((patchWidth + patchGapWidth) > 64 && !skipCorrupt) { - throw new OrcCorruptionException("ORC file is corrupt"); - } + verifyFormat((patchWidth + patchGapWidth) <= 64 || skipCorrupt, "ORC file is corrupt"); int bitSize = LongDecode.getClosestFixedBits(patchWidth + patchGapWidth); readBitPackedLongs(unpackedPatch, 0, patchListLength, bitSize, input); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java index ec69c1a0c2..54472236d8 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java @@ -13,50 +13,50 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; +import com.google.common.base.MoreObjects; import 
org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import com.google.common.base.MoreObjects; -import com.google.common.primitives.Ints; -import io.airlift.slice.FixedLengthSliceInput; +import io.airlift.slice.BasicSliceInput; import io.airlift.slice.Slice; import io.airlift.slice.Slices; import org.iq80.snappy.Snappy; import java.io.IOException; import java.io.InputStream; -import java.util.Arrays; import java.util.zip.DataFormatException; import java.util.zip.Inflater; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.*; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.*; +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; import static io.airlift.slice.Slices.EMPTY_SLICE; import static sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET; public final class OrcInputStream extends InputStream { - public static final int EXPECTED_COMPRESSION_RATIO = 5; + public static final int BLOCK_HEADER_SIZE = 3; + private final String source; - private final FixedLengthSliceInput compressedSliceInput; + private final BasicSliceInput compressedSliceInput; private final CompressionKind compressionKind; - private final int maxBufferSize; + private final int bufferSize; private int currentCompressedBlockOffset; - private FixedLengthSliceInput current; + private BasicSliceInput current; - private byte[] buffer; + private Slice buffer; - public OrcInputStream(String source, FixedLengthSliceInput sliceInput, CompressionKind compressionKind, int bufferSize) + public OrcInputStream(String source, BasicSliceInput 
sliceInput, CompressionKind compressionKind, int bufferSize) { this.source = checkNotNull(source, "source is null"); checkNotNull(sliceInput, "sliceInput is null"); this.compressionKind = checkNotNull(compressionKind, "compressionKind is null"); - this.maxBufferSize = bufferSize; + this.bufferSize = bufferSize; if (compressionKind == UNCOMPRESSED) { this.current = sliceInput; @@ -117,7 +117,7 @@ public int read(byte[] b, int off, int length) return -1; } - if (current.remaining() == 0) { + if (!current.isReadable()) { advance(); if (current == null) { return -1; @@ -130,11 +130,11 @@ public int read(byte[] b, int off, int length) public long getCheckpoint() { // if the decompressed buffer is empty, return a checkpoint starting at the next block - if (current == null || (current.position() == 0 && current.remaining() == 0)) { - return createInputStreamCheckpoint(Ints.checkedCast(compressedSliceInput.position()), 0); + if (current == null || (current.position() == 0 && current.available() == 0)) { + return createInputStreamCheckpoint(compressedSliceInput.position(), 0); } // otherwise return a checkpoint at the last compressed block read and the current position in the buffer - return createInputStreamCheckpoint(currentCompressedBlockOffset, Ints.checkedCast(current.position())); + return createInputStreamCheckpoint(currentCompressedBlockOffset, current.position()); } public boolean seekToCheckpoint(long checkpoint) @@ -144,9 +144,7 @@ public boolean seekToCheckpoint(long checkpoint) int decompressedOffset = decodeDecompressedOffset(checkpoint); boolean discardedBuffer; if (compressedBlockOffset != currentCompressedBlockOffset) { - if (compressionKind == UNCOMPRESSED) { - throw new OrcCorruptionException("Reset stream has a compressed block offset but stream is not compressed"); - } + verifyFormat(compressionKind != UNCOMPRESSED, "Reset stream has a compressed block offset but stream is not compressed"); compressedSliceInput.setPosition(compressedBlockOffset); 
current = EMPTY_SLICE.getInput(); discardedBuffer = true; @@ -157,8 +155,8 @@ public boolean seekToCheckpoint(long checkpoint) if (decompressedOffset != current.position()) { current.setPosition(0); - if (current.remaining() < decompressedOffset) { - decompressedOffset -= current.remaining(); + if (current.available() < decompressedOffset) { + decompressedOffset -= current.available(); advance(); } current.setPosition(decompressedOffset); @@ -188,14 +186,14 @@ public long skip(long n) private void advance() throws IOException { - if (compressedSliceInput == null || compressedSliceInput.remaining() == 0) { + if (compressedSliceInput == null || compressedSliceInput.available() == 0) { current = null; return; } // 3 byte header // NOTE: this must match BLOCK_HEADER_SIZE - currentCompressedBlockOffset = Ints.checkedCast(compressedSliceInput.position()); + currentCompressedBlockOffset = compressedSliceInput.position(); int b0 = compressedSliceInput.readUnsignedByte(); int b1 = compressedSliceInput.readUnsignedByte(); int b2 = compressedSliceInput.readUnsignedByte(); @@ -209,15 +207,19 @@ private void advance() current = chunk.getInput(); } else { + if (buffer == null) { + buffer = Slices.allocate(bufferSize); + } + int uncompressedSize; if (compressionKind == ZLIB) { - uncompressedSize = decompressZip(chunk); + uncompressedSize = decompressZip(chunk, buffer); } else { - uncompressedSize = decompressSnappy(chunk); + uncompressedSize = decompressSnappy(chunk, buffer); } - current = Slices.wrappedBuffer(buffer, 0, uncompressedSize).getInput(); + current = buffer.slice(0, uncompressedSize).getInput(); } } @@ -233,63 +235,40 @@ public String toString() } // This comes from the Apache Hive ORC code - private int decompressZip(Slice in) + private static int decompressZip(Slice in, Slice buffer) throws IOException { + byte[] outArray = (byte[]) buffer.getBase(); + int outOffset = 0; + + byte[] inArray = (byte[]) in.getBase(); + int inOffset = (int) (in.getAddress() - 
ARRAY_BYTE_BASE_OFFSET); + int inLength = in.length(); + Inflater inflater = new Inflater(true); - try { - inflater.setInput((byte[]) in.getBase(), (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET), in.length()); - allocateOrGrowBuffer(in.length() * EXPECTED_COMPRESSION_RATIO, false); - int uncompressedLength = 0; - while (true) { - uncompressedLength += inflater.inflate(buffer, uncompressedLength, buffer.length - uncompressedLength); - if (inflater.finished() || buffer.length >= maxBufferSize) { - break; - } - int oldBufferSize = buffer.length; - allocateOrGrowBuffer(buffer.length * 2, true); - if (buffer.length <= oldBufferSize) { - throw new IllegalStateException(String.format("Buffer failed to grow. Old size %d, current size %d", oldBufferSize, buffer.length)); - } + inflater.setInput(inArray, inOffset, inLength); + while (!(inflater.finished() || inflater.needsDictionary() || inflater.needsInput())) { + try { + int count = inflater.inflate(outArray, outOffset, outArray.length - outOffset); + outOffset += count; } - - if (!inflater.finished()) { - throw new OrcCorruptionException("Could not decompress all input (output buffer too small?)"); + catch (DataFormatException e) { + throw new OrcCorruptionException(e, "Invalid compressed stream"); } - - return uncompressedLength; - } - catch (DataFormatException e) { - throw new OrcCorruptionException(e, "Invalid compressed stream"); - } - finally { - inflater.end(); } + inflater.end(); + return outOffset; } - private int decompressSnappy(Slice in) + private static int decompressSnappy(Slice in, Slice buffer) throws IOException { + byte[] outArray = (byte[]) buffer.getBase(); + byte[] inArray = (byte[]) in.getBase(); int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET); int inLength = in.length(); - int uncompressedLength = Snappy.getUncompressedLength(inArray, inOffset); - checkArgument(uncompressedLength <= maxBufferSize, "Snappy requires buffer (%d) larger than max size (%d)", uncompressedLength, 
maxBufferSize); - allocateOrGrowBuffer(uncompressedLength, false); - - return Snappy.uncompress(inArray, inOffset, inLength, buffer, 0); - } - - private void allocateOrGrowBuffer(int size, boolean copyExistingData) - { - if (buffer == null || buffer.length < size) { - if (copyExistingData && buffer != null) { - buffer = Arrays.copyOfRange(buffer, 0, Math.min(size, maxBufferSize)); - } - else { - buffer = new byte[Math.min(size, maxBufferSize)]; - } - } + return Snappy.uncompress(inArray, inOffset, inLength, outArray, 0); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java index 58b8b86b2f..2f04155d6c 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java @@ -13,13 +13,12 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; - import java.io.IOException; import java.io.InputStream; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; final class OrcStreamUtils { @@ -34,9 +33,7 @@ public static void skipFully(InputStream input, long length) { while (length > 0) { long result = input.skip(length); - if (result < 0) { - throw new OrcCorruptionException("Unexpected end of stream"); - } + verifyFormat(result >= 0, "Unexpected end of stream"); length -= result; } } @@ -46,9 +43,7 @@ public static void readFully(InputStream input, byte[] buffer, int offset, int l { while (offset < length) { int result = input.read(buffer, offset, length - offset); - if (result < 
0) { - throw new OrcCorruptionException("Unexpected end of stream"); - } + verifyFormat(result >= 0, "Unexpected end of stream"); offset += result; } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java index 2e8acf215a..e03dbbbae1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java @@ -13,17 +13,17 @@ */ package org.apache.tajo.storage.thirdparty.orc.stream; +import com.google.common.collect.ImmutableMap; import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; import org.apache.tajo.storage.thirdparty.orc.StreamId; import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; -import com.google.common.collect.ImmutableMap; import javax.annotation.Nonnull; import java.util.Map; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; public class StreamSources { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreamSource.java deleted file mode 100644 index 45288e8387..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreamSource.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except 
in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.base.MoreObjects; - -import javax.annotation.Nullable; -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkNotNull; - -public class ValueStreamSource> implements StreamSource -{ - private final S stream; - - public ValueStreamSource(S stream) - { - this.stream = checkNotNull(stream, "stream is null"); - } - - @Override - public Class getStreamType() - { - return (Class) stream.getClass(); - } - - @Nullable - @Override - public S openStream() - throws IOException - { - return stream; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("stream", stream) - .toString(); - } -} From 015e40d10d86c5d3b2c37ac3531e32945aa55f14 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 20 May 2015 12:04:11 +0900 Subject: [PATCH 046/141] HdfsOrcDataSource constructor is changed to receive double instead of DataSize --- .../tajo/storage/thirdparty/orc/HdfsOrcDataSource.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java index a373c27581..16414d2016 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java @@ -40,14 +40,15 @@ public class HdfsOrcDataSource private final DataSize maxMergeDistance; private long readTimeNanos; - public HdfsOrcDataSource(String path, FSDataInputStream inputStream, long size, DataSize maxMergeDistance) + public HdfsOrcDataSource(String path, FSDataInputStream inputStream, long size, double maxMergeDistance) { this.path = checkNotNull(path, "path is null"); this.inputStream = checkNotNull(inputStream, "inputStream is null"); this.size = size; checkArgument(size >= 0, "size is negative"); - this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); + DataSize mergeDistance = new DataSize(maxMergeDistance, DataSize.Unit.BYTE); + this.maxMergeDistance = checkNotNull(mergeDistance, "maxMergeDistance is null"); } @Override From 4952195d4ea5dfcd2fa7c08c143477c3b446c13b Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 20 May 2015 14:34:34 +0900 Subject: [PATCH 047/141] Initial OrcScanner --- .../apache/tajo/storage/orc/OrcScanner.java | 167 +++++++++++++++++- 1 file changed, 165 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 279f3c6a46..e161b1dc2e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -19,32 +19,195 @@ package org.apache.tajo.storage.orc; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.conf.TajoConf; 
+import org.apache.tajo.datum.*; +import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.storage.FileScanner; import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.VTuple; import org.apache.tajo.storage.fragment.Fragment; +import org.apache.tajo.storage.thirdparty.orc.*; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnStatistics; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader; +import org.joda.time.DateTimeZone; import java.io.IOException; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; public class OrcScanner extends FileScanner { + private OrcRecordReader recordReader; + private Vector [] vectors; + private int currentPosInBatch = 0; + private int batchSize = 0; public OrcScanner(Configuration conf, final Schema schema, final TableMeta meta, final Fragment fragment) { super(conf, schema, meta, fragment); } + private Vector createOrcVector(TajoDataTypes.Type type) { + switch (type) { + case INT1: case INT2: case INT4: case INT8: + case UINT1: case UINT2: case UINT4: case UINT8: + return new LongVector(); + + case FLOAT4: + case FLOAT8: + return new DoubleVector(); + + case BOOLEAN: + return new BooleanVector(); + + case BLOB: + case TEXT: + return new SliceVector(); + + default: + throw new UnsupportedException("This data type is not supported currently: "+type.toString()); + } + } + + + private FileSystem fs; + private FSDataInputStream fis; + @Override public void init() throws IOException { + OrcReader orcReader; + if (targets == null) { targets = schema.toArray(); } super.init(); + + Path path = fragment.getPath(); + + // FileFragment information + if(fs == null) { + fs = FileScanner.getFileSystem((TajoConf)conf, path); + } + if(fis == null) fis = fs.open(path); + + OrcDataSource orcDataSource = new HdfsOrcDataSource( + this.fragment.getPath().toString(), + fis, + fs.getFileStatus(path).getLen(), + 200000000); + + for (int i=0; i columnSet = new HashSet(); 
+ for (int i=0; i statisticsByColumnIndex) { + return true; + } + }, + 0, 1024, DateTimeZone.getDefault()); + + getNextBatch(); } @Override public Tuple next() throws IOException { - return null; + if (currentPosInBatch == batchSize) { + getNextBatch(); + + // EOF + if (batchSize == -1) { + return null; + } + } + + int columnSize = schema.size(); + Tuple tuple = new VTuple(columnSize); + + for (int i=0; i Date: Wed, 20 May 2015 15:05:52 +0900 Subject: [PATCH 048/141] Close code error fixed --- .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index e161b1dc2e..e8a7dbf0eb 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -217,7 +217,9 @@ public void reset() throws IOException { @Override public void close() throws IOException { - recordReader.close(); + if (recordReader != null) { + recordReader.close(); + } } @Override From 5706b108d76fb7b055e6fe601eb9f85e56da75fe Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 20 May 2015 15:57:53 +0900 Subject: [PATCH 049/141] Creating vectors missed --- .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index e8a7dbf0eb..9a521122e0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -74,7 +74,6 @@ private Vector 
createOrcVector(TajoDataTypes.Type type) { } } - private FileSystem fs; private FSDataInputStream fis; @@ -102,6 +101,7 @@ public void init() throws IOException { fs.getFileStatus(path).getLen(), 200000000); + vectors = new Vector[schema.size()]; for (int i=0; i Date: Wed, 20 May 2015 17:37:51 +0900 Subject: [PATCH 050/141] Add comment --- .../apache/tajo/storage/orc/OrcScanner.java | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 9a521122e0..9b37255279 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -42,6 +42,9 @@ import java.util.Map; import java.util.Set; +/** + * OrcScanner for reading ORC files + */ public class OrcScanner extends FileScanner { private OrcRecordReader recordReader; private Vector [] vectors; @@ -89,23 +92,28 @@ public void init() throws IOException { Path path = fragment.getPath(); - // FileFragment information if(fs == null) { fs = FileScanner.getFileSystem((TajoConf)conf, path); } - if(fis == null) fis = fs.open(path); + if(fis == null) { + fis = fs.open(path); + } + + // TODO: max merge distance should be fetched from conf OrcDataSource orcDataSource = new HdfsOrcDataSource( this.fragment.getPath().toString(), fis, fs.getFileStatus(path).getLen(), 200000000); + // creating vectors for buffering vectors = new Vector[schema.size()]; for (int i=0; i columnSet = new HashSet(); for (int i=0; i statisticsByColumnIndex) { @@ -152,6 +158,7 @@ public Tuple next() throws IOException { return tuple; } + // TODO: support more types private Datum createValueDatum(Vector vector, TajoDataTypes.Type type) { switch (type) { case INT1: @@ -212,7 +219,6 @@ public float getProgress() { 
@Override public void reset() throws IOException { - } @Override From 6a4fd5e1127028da0d6deb45e546f34ffc453493 Mon Sep 17 00:00:00 2001 From: JaeHwa Jung Date: Thu, 21 May 2015 16:01:25 +0900 Subject: [PATCH 051/141] TAJO-1586: TajoMaster HA startup failure on Yarn. (missing changes) --- tajo-core/src/main/resources/webapps/admin/catalogview.jsp | 1 + 1 file changed, 1 insertion(+) diff --git a/tajo-core/src/main/resources/webapps/admin/catalogview.jsp b/tajo-core/src/main/resources/webapps/admin/catalogview.jsp index 3455d0b35e..e9f93fe133 100644 --- a/tajo-core/src/main/resources/webapps/admin/catalogview.jsp +++ b/tajo-core/src/main/resources/webapps/admin/catalogview.jsp @@ -30,6 +30,7 @@ <%@ page import="java.util.Collection" %> <%@ page import="java.util.List" %> <%@ page import="java.util.Map" %> +<%@ page import="java.net.InetSocketAddress" %> <% TajoMaster master = (TajoMaster) StaticHttpServer.getInstance().getAttribute("tajo.info.server.object"); From 45641e04f708588fbc45bffd40b90c4d1b5266a8 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 16:21:02 +0900 Subject: [PATCH 052/141] FileOrcDataSource constructor modified --- .../apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java index 3d0c42eb89..6b04204668 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java @@ -37,14 +37,14 @@ public class FileOrcDataSource private final DataSize maxMergeDistance; private long readTimeNanos; - public FileOrcDataSource(File path, DataSize maxMergeDistance) + public FileOrcDataSource(File 
path, double mergeDistance) throws IOException { this.path = checkNotNull(path, "path is null"); this.size = path.length(); this.input = new RandomAccessFile(path, "r"); - this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); + maxMergeDistance = new DataSize(mergeDistance, DataSize.Unit.BYTE); } @Override From e7cd698f0cc9094c1ec2d1f3d7b7fb34102bf6b6 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 16:21:36 +0900 Subject: [PATCH 053/141] Supporting timestamp --- .../main/java/org/apache/tajo/storage/orc/OrcScanner.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 9b37255279..75add87a73 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -35,6 +35,7 @@ import org.apache.tajo.storage.thirdparty.orc.*; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnStatistics; import org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader; +import org.apache.tajo.util.datetime.DateTimeUtil; import org.joda.time.DateTimeZone; import java.io.IOException; @@ -59,6 +60,7 @@ private Vector createOrcVector(TajoDataTypes.Type type) { switch (type) { case INT1: case INT2: case INT4: case INT8: case UINT1: case UINT2: case UINT4: case UINT8: + case TIMESTAMP: return new LongVector(); case FLOAT4: @@ -190,6 +192,9 @@ private Datum createValueDatum(Vector vector, TajoDataTypes.Type type) { case BLOB: return new BlobDatum(((SliceVector)vector).vector[currentPosInBatch].getBytes()); + case TIMESTAMP: + return new TimestampDatum(DateTimeUtil.javaTimeToJulianTime(((LongVector) vector).vector[currentPosInBatch])); + default: throw new UnsupportedException("This data type is not supported 
currently: "+type.toString()); } From e839228bfb49454d3a6d5b3b22550c14b96e42b9 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 16:36:53 +0900 Subject: [PATCH 054/141] OrcScaner test added --- .../src/test/resources/storage-default.xml | 11 +- .../tajo/storage/orc/TestOrcScanner.java | 107 ++++++++++++++++++ .../src/test/resources/dataset/u_data_20.orc | Bin 0 -> 813 bytes 3 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/u_data_20.orc diff --git a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml index 6aa32fc0da..41804b3ee7 100644 --- a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml @@ -38,7 +38,7 @@ tajo.storage.scanner-handler - csv,raw,rcfile,row,trevni,parquet,sequencefile,avro + csv,raw,rcfile,row,trevni,parquet,orc,sequencefile,avro @@ -66,6 +66,10 @@ tajo.storage.fragment.parquet.class org.apache.tajo.storage.FileFragment + + tajo.storage.fragment.orc.class + org.apache.tajo.storage.FileFragment + tajo.storage.fragment.sequencefile.class org.apache.tajo.storage.fragment.FileFragment @@ -106,6 +110,11 @@ org.apache.tajo.storage.parquet.ParquetScanner + + tajo.storage.scanner-handler.orc.class + org.apache.tajo.storage.orc.OrcScanner + + tajo.storage.scanner-handler.sequencefile.class org.apache.tajo.storage.sequencefile.SequenceFileScanner diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java new file mode 100644 index 0000000000..8b60b9c2c7 --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.conf.TajoConf; +import org.apache.tajo.datum.TimestampDatum; +import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.fragment.FileFragment; +import org.apache.tajo.storage.fragment.Fragment; +import org.apache.tajo.util.KeyValueSet; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.net.URL; + +public class TestOrcScanner { + private OrcScanner orcScanner; + + public static Path getResourcePath(String path, String suffix) { + URL resultBaseURL = ClassLoader.getSystemResource(path); + return new Path(resultBaseURL.toString(), suffix); + } + + private static FileFragment 
getFileFragment(Configuration conf, String fileName) throws IOException { + Path tablePath = new Path(getResourcePath("dataset", "."), fileName); + FileSystem fs = FileSystem.getLocal(conf); + FileStatus status = fs.getFileStatus(tablePath); + return new FileFragment("table", tablePath, 0, status.getLen()); + } + + @Before + public void setup() throws IOException { + Schema schema = new Schema(); + schema.addColumn("userid", TajoDataTypes.Type.INT4); + schema.addColumn("movieid", TajoDataTypes.Type.INT4); + schema.addColumn("rating", TajoDataTypes.Type.INT2); + schema.addColumn("unixtimestamp", TajoDataTypes.Type.TEXT); + schema.addColumn("faketime", TajoDataTypes.Type.TIMESTAMP); + + Configuration conf = new TajoConf(); + + TableMeta meta = new TableMeta("ORC", new KeyValueSet()); + + Fragment fragment = getFileFragment(conf, "u_data_20.orc"); + + orcScanner = new OrcScanner(conf, schema, meta, fragment); + + orcScanner.init(); + } + + @Test + public void testReadTuple() { + try { + Tuple tuple = orcScanner.next(); + + assertEquals(tuple.getInt4(0), 196); + assertEquals(tuple.getInt4(1), 242); + assertEquals(tuple.getInt2(2), 3); + assertEquals(tuple.getText(3), "881250949"); + + // Timestamp test + TimestampDatum timestamp = (TimestampDatum)tuple.get(4); + + assertEquals(timestamp.getYear(), 2008); + assertEquals(timestamp.getMonthOfYear(), 12); + assertEquals(timestamp.getDayOfMonth(), 12); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @After + public void end() { + try { + orcScanner.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} \ No newline at end of file diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/u_data_20.orc b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/u_data_20.orc new file mode 100644 index 0000000000000000000000000000000000000000..e6e9c49e0a8747934f03c2220c0ed8cf17e40169 GIT binary patch literal 813 zcmeYdau#G@;9?VE;SdR6&<8R_xtJLk7=-vZM1(jvcmy7?NX)VgU@!#Ah(Todfij#N 
zRssivBu+Il1~6y?rGz0;JU}Tn4kiIEiK+mGAO?oVmmVagGB7Gu@(HSX>iKaO3v(w6 zdkYII`f+3aal(upj3o>@j4v2{m_ityG5%soV&Gw@VX|SIz_^6*08<>p7seJQH^wUv#ZQ=G zm_9M2F>PVq!X(7Jh`Ehn4HFal8^%2h0Ss1bO3Z9bDjX`D4)P2PZair!8eVB$3mDkk zn3uF1VPFvLJt=s=f#*=vr-M>2_HUdppN(7j>mDN=^Vvr_Cf%qFik+LjH2Sk8XVm0~ zS2sFd>=V?Rd%MwULwNMwMbn#I8$Leo44>h;H1zkI|Bm7xB-pENM}FJzL8c9D7THEfX#9>@-%W;h7 zSYYRwzc!y4xIudMDl>F3FvOlR*cNhgZ&;DKPp5hcUZVke%nf3mb(!@|emIVUTHh3f+Y>kW$;EIU+IHQ3;fO$Syb8Kt-y zna^r|^NOy-CfqyvO?kExlLUhgBg5kau^-2EjvhX!v!N$&(R9y)&dx5L?al5TQzv%L zYz^%)wX?K8&fp~Jq@m1_%CpdC$_$Gc6Ai2dilsUtHWqZS7({$5?BFOY_`qS4G;u~M z_mj`eKj+AVF|oI(_?a3pu~sxN<;eIj#R!Eu%(H3ln4}`Uyme=1>&DKdot*;9TbE93 w+_`k(#+^sF)fO;ePiK6>e;7DcG72zCG&Be(F){FH?OaK4? literal 0 HcmV?d00001 From 28d7f1f54724d8f517b660e9833b91b241fca460 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 20 May 2015 14:52:22 +0900 Subject: [PATCH 055/141] Added orc row in storage-default.xml --- .../src/main/resources/storage-default.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index 93611fbd70..8bdd36ff10 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -120,6 +120,11 @@ org.apache.tajo.storage.parquet.ParquetScanner + + tajo.storage.scanner-handler.orc.class + org.apache.tajo.storage.orc.OrcScanner + + tajo.storage.scanner-handler.sequencefile.class org.apache.tajo.storage.sequencefile.SequenceFileScanner From 59970024de3ad02ef2ca24945f48bb39d89872ae Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 29 Mar 2015 16:14:11 +0900 Subject: [PATCH 056/141] TAJO-1463: Add ORCFile store type to create ORCFile table --- .../src/main/java/org/apache/tajo/catalog/CatalogUtil.java | 2 ++ .../tajo-catalog-common/src/main/proto/CatalogProtos.proto | 1 + 
.../src/main/java/org/apache/tajo/storage/Tablespace.java | 3 ++- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java index a2e4a9d303..0d96755ebe 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java +++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java @@ -288,6 +288,8 @@ public static StoreType getStoreType(final String typeStr) { return StoreType.ROWFILE; } else if (typeStr.equalsIgnoreCase(StoreType.RCFILE.name())) { return StoreType.RCFILE; + } else if (typeStr.equalsIgnoreCase(StoreType.ORCFILE.name())) { + return StoreType.ORCFILE; } else if (typeStr.equalsIgnoreCase(StoreType.PARQUET.name())) { return StoreType.PARQUET; } else if (typeStr.equalsIgnoreCase(StoreType.SEQUENCEFILE.name())) { diff --git a/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto b/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto index b213916f30..4eb4af4735 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto +++ b/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto @@ -32,6 +32,7 @@ enum StoreType { RCFILE = 3; ROWFILE = 4; HCFILE = 5; + ORCFILE = 6; PARQUET = 7; SEQUENCEFILE = 8; AVRO = 9; diff --git a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/Tablespace.java b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/Tablespace.java index 0626da8cf5..7c026f5ab5 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/Tablespace.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/Tablespace.java @@ -31,6 +31,7 @@ import org.apache.tajo.catalog.proto.CatalogProtos.FragmentProto; import org.apache.tajo.conf.TajoConf; import 
org.apache.tajo.conf.TajoConf.ConfVars; +import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.plan.LogicalPlan; import org.apache.tajo.plan.logical.LogicalNode; import org.apache.tajo.plan.logical.ScanNode; @@ -296,7 +297,7 @@ public Class getScannerClass(String storeType) throws IOExcep } if (scannerClass == null) { - throw new IOException("Unknown Storage Type: " + storeType); + throw new UnsupportedException("Unsupported Storage Type: " + storeType.name()); } return scannerClass; From e01d45e6ac6b71dc9b11eaaaa02126f5f52ebd38 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 16:46:20 +0900 Subject: [PATCH 057/141] compile error fixed --- .../src/main/java/org/apache/tajo/storage/Tablespace.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/Tablespace.java b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/Tablespace.java index 7c026f5ab5..e8396702a6 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/Tablespace.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/Tablespace.java @@ -297,7 +297,7 @@ public Class getScannerClass(String storeType) throws IOExcep } if (scannerClass == null) { - throw new UnsupportedException("Unsupported Storage Type: " + storeType.name()); + throw new UnsupportedException("Unsupported Storage Type: " + storeType); } return scannerClass; From ae613a704f6a216899144a73a4e09bdab9b3de77 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 17:10:12 +0900 Subject: [PATCH 058/141] TimestampDatum comment fixed --- .../src/main/java/org/apache/tajo/datum/TimestampDatum.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java index ad73c749ba..de9c4dca37 100644 --- 
a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java @@ -36,7 +36,7 @@ public class TimestampDatum extends Datum { /** * - * @param timestamp UTC based + * @param timestamp UTC based Julian time microseconds */ public TimestampDatum(long timestamp) { super(TajoDataTypes.Type.TIMESTAMP); From 904461443e02ae71dccc50bb1ef613e37914f3e0 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 18:23:59 +0900 Subject: [PATCH 059/141] Imported presto-orc maven jar and removed most of classes from Presto --- tajo-storage/tajo-storage-hdfs/pom.xml | 5 + .../apache/tajo/storage/orc/OrcScanner.java | 9 +- .../storage/thirdparty/orc/BooleanVector.java | 36 -- .../storage/thirdparty/orc/DiskRange.java | 77 --- .../storage/thirdparty/orc/DoubleVector.java | 36 -- .../thirdparty/orc/FileOrcDataSource.java | 11 +- .../thirdparty/orc/HdfsOrcDataSource.java | 13 +- .../storage/thirdparty/orc/LongVector.java | 36 -- .../storage/thirdparty/orc/ObjectVector.java | 29 -- .../orc/OrcCorruptionException.java | 43 -- .../storage/thirdparty/orc/OrcDataSource.java | 37 -- .../thirdparty/orc/OrcDataSourceUtils.java | 82 ---- .../storage/thirdparty/orc/OrcPredicate.java | 40 -- .../storage/thirdparty/orc/OrcReader.java | 219 --------- .../thirdparty/orc/OrcRecordReader.java | 321 ------------- .../tajo/storage/thirdparty/orc/RowGroup.java | 58 --- .../storage/thirdparty/orc/SliceVector.java | 36 -- .../thirdparty/orc/StreamDescriptor.java | 83 ---- .../tajo/storage/thirdparty/orc/StreamId.java | 77 --- .../tajo/storage/thirdparty/orc/Stripe.java | 70 --- .../storage/thirdparty/orc/StripeReader.java | 352 -------------- .../tajo/storage/thirdparty/orc/Vector.java | 24 - .../checkpoint/BooleanStreamCheckpoint.java | 58 --- .../checkpoint/ByteArrayStreamCheckpoint.java | 50 -- .../orc/checkpoint/ByteStreamCheckpoint.java | 60 --- .../orc/checkpoint/Checkpoints.java | 405 ---------------- 
.../checkpoint/DoubleStreamCheckpoint.java | 50 -- .../orc/checkpoint/FloatStreamCheckpoint.java | 50 -- .../orc/checkpoint/InputStreamCheckpoint.java | 64 --- .../orc/checkpoint/LongStreamCheckpoint.java | 19 - .../checkpoint/LongStreamDwrfCheckpoint.java | 50 -- .../checkpoint/LongStreamV1Checkpoint.java | 60 --- .../checkpoint/LongStreamV2Checkpoint.java | 60 --- ...GroupDictionaryLengthStreamCheckpoint.java | 53 -- .../orc/checkpoint/StreamCheckpoint.java | 18 - .../orc/json/BooleanJsonReader.java | 117 ----- .../thirdparty/orc/json/ByteJsonReader.java | 118 ----- .../thirdparty/orc/json/DateJsonReader.java | 123 ----- .../thirdparty/orc/json/DoubleJsonReader.java | 120 ----- .../thirdparty/orc/json/FloatJsonReader.java | 122 ----- .../thirdparty/orc/json/JsonMapKeyReader.java | 23 - .../thirdparty/orc/json/JsonReader.java | 36 -- .../thirdparty/orc/json/JsonReaders.java | 100 ---- .../thirdparty/orc/json/ListJsonReader.java | 125 ----- .../orc/json/LongDictionaryJsonReader.java | 142 ------ .../orc/json/LongDirectJsonReader.java | 112 ----- .../thirdparty/orc/json/LongJsonReader.java | 99 ---- .../thirdparty/orc/json/MapJsonReader.java | 138 ------ .../orc/json/SliceDictionaryJsonReader.java | 269 ----------- .../orc/json/SliceDirectJsonReader.java | 168 ------- .../thirdparty/orc/json/SliceJsonReader.java | 98 ---- .../thirdparty/orc/json/StructJsonReader.java | 117 ----- .../orc/json/TimestampJsonReader.java | 134 ------ .../orc/metadata/BooleanStatistics.java | 29 -- .../orc/metadata/ColumnEncoding.java | 57 --- .../orc/metadata/ColumnStatistics.java | 74 --- .../orc/metadata/CompressionKind.java | 19 - .../orc/metadata/DateStatistics.java | 39 -- .../orc/metadata/DoubleStatistics.java | 39 -- .../orc/metadata/DwrfMetadataReader.java | 367 -------------- .../thirdparty/orc/metadata/Footer.java | 76 --- .../orc/metadata/IntegerStatistics.java | 37 -- .../thirdparty/orc/metadata/Metadata.java | 31 -- .../orc/metadata/MetadataReader.java | 36 -- 
.../orc/metadata/OrcMetadataReader.java | 402 ---------------- .../thirdparty/orc/metadata/OrcType.java | 105 ---- .../thirdparty/orc/metadata/PostScript.java | 76 --- .../orc/metadata/RangeStatistics.java | 20 - .../orc/metadata/RowGroupIndex.java | 42 -- .../thirdparty/orc/metadata/Stream.java | 78 --- .../orc/metadata/StringStatistics.java | 39 -- .../thirdparty/orc/metadata/StripeFooter.java | 42 -- .../orc/metadata/StripeInformation.java | 71 --- .../orc/metadata/StripeStatistics.java | 35 -- .../orc/reader/BooleanStreamReader.java | 153 ------ .../orc/reader/ByteStreamReader.java | 155 ------ .../orc/reader/DoubleStreamReader.java | 155 ------ .../orc/reader/FloatStreamReader.java | 156 ------ .../orc/reader/JsonStreamReader.java | 180 ------- .../reader/LongDictionaryStreamReader.java | 210 -------- .../orc/reader/LongDirectStreamReader.java | 155 ------ .../orc/reader/LongStreamReader.java | 88 ---- .../reader/SliceDictionaryStreamReader.java | 287 ----------- .../orc/reader/SliceDirectStreamReader.java | 198 -------- .../orc/reader/SliceStreamReader.java | 88 ---- .../thirdparty/orc/reader/StreamReader.java | 34 -- .../thirdparty/orc/reader/StreamReaders.java | 58 --- .../orc/reader/TimestampStreamReader.java | 217 --------- .../thirdparty/orc/stream/BooleanStream.java | 211 -------- .../orc/stream/ByteArrayStream.java | 67 --- .../thirdparty/orc/stream/ByteStream.java | 134 ------ .../orc/stream/CheckpointStreamSource.java | 69 --- .../thirdparty/orc/stream/DoubleStream.java | 104 ---- .../thirdparty/orc/stream/FloatStream.java | 109 ----- .../thirdparty/orc/stream/LongDecode.java | 177 ------- .../thirdparty/orc/stream/LongStream.java | 40 -- .../thirdparty/orc/stream/LongStreamDwrf.java | 129 ----- .../thirdparty/orc/stream/LongStreamV1.java | 184 ------- .../thirdparty/orc/stream/LongStreamV2.java | 452 ------------------ .../orc/stream/MissingStreamSource.java | 46 -- .../thirdparty/orc/stream/OrcInputStream.java | 274 ----------- 
.../thirdparty/orc/stream/OrcStreamUtils.java | 61 --- .../RowGroupDictionaryLengthStream.java | 52 -- .../thirdparty/orc/stream/StreamSource.java | 26 - .../thirdparty/orc/stream/StreamSources.java | 56 --- .../thirdparty/orc/stream/ValueStream.java | 29 -- .../thirdparty/orc/stream/ValueStreams.java | 146 ------ 107 files changed, 28 insertions(+), 11038 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcPredicate.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RowGroup.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Vector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java delete mode 
100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/StreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/BooleanStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnEncoding.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/CompressionKind.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DateStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DoubleStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Footer.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/IntegerStatistics.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Metadata.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/MetadataReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcType.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/PostScript.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RangeStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RowGroupIndex.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Stream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeFooter.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeInformation.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/BooleanStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java delete 
mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/MissingStreamSource.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/RowGroupDictionaryLengthStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSource.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreams.java diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml 
b/tajo-storage/tajo-storage-hdfs/pom.xml index efe7b210e3..d6e1cb728b 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -394,6 +394,11 @@ jackson-core 2.4.2 + + com.facebook.presto + presto-orc + 0.86 + com.facebook.hive hive-dwrf diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 75add87a73..d72c968fc1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -32,9 +32,10 @@ import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.VTuple; import org.apache.tajo.storage.fragment.Fragment; -import org.apache.tajo.storage.thirdparty.orc.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnStatistics; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader; +import com.facebook.presto.orc.*; +import com.facebook.presto.orc.metadata.ColumnStatistics; +import com.facebook.presto.orc.metadata.OrcMetadataReader; +import org.apache.tajo.storage.thirdparty.orc.HdfsOrcDataSource; import org.apache.tajo.util.datetime.DateTimeUtil; import org.joda.time.DateTimeZone; @@ -107,7 +108,7 @@ public void init() throws IOException { this.fragment.getPath().toString(), fis, fs.getFileStatus(path).getLen(), - 200000000); + 100000000); // creating vectors for buffering vectors = new Vector[schema.size()]; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java deleted file mode 100644 index aaa1ada35c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java +++ /dev/null @@ -1,36 +0,0 @@ 
-/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public class BooleanVector - implements Vector -{ - public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; - public final boolean[] vector = new boolean[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - ObjectVector objectVector = new ObjectVector(); - for (int i = 0; i < size; i++) { - if (!isNull[i]) { - objectVector.vector[i] = vector[i]; - } - } - return objectVector; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java deleted file mode 100644 index 8a3f249c3f..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.primitives.Ints; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -public final class DiskRange -{ - private final long offset; - private final int length; - - public DiskRange(long offset, int length) - { - checkArgument(offset >= 0, "offset is negative"); - checkArgument(length >= 0, "length is negative"); - - this.offset = offset; - this.length = length; - } - - public long getOffset() - { - return offset; - } - - public int getLength() - { - return length; - } - - public long getEnd() - { - return offset + length; - } - - public boolean contains(DiskRange diskRange) - { - return offset <= diskRange.getOffset() && diskRange.getEnd() <= getEnd(); - } - - /** - * Returns the minimal DiskRange that encloses both this DiskRange - * and otherDiskRange. If there was a gap between the ranges the - * new range will cover that gap. 
- */ - public DiskRange span(DiskRange otherDiskRange) - { - checkNotNull(otherDiskRange, "otherDiskRange is null"); - long start = Math.min(this.offset, otherDiskRange.getOffset()); - long end = Math.max(getEnd(), otherDiskRange.getEnd()); - return new DiskRange(start, Ints.checkedCast(end - start)); - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("offset", offset) - .add("length", length) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java deleted file mode 100644 index 8f20d29590..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public class DoubleVector - implements Vector -{ - public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; - public final double[] vector = new double[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - ObjectVector objectVector = new ObjectVector(); - for (int i = 0; i < size; i++) { - if (!isNull[i]) { - objectVector.vector[i] = vector[i]; - } - } - return objectVector; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java index 6b04204668..dcc134705b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java @@ -13,6 +13,8 @@ */ package org.apache.tajo.storage.thirdparty.orc; +import com.facebook.presto.orc.DiskRange; +import com.facebook.presto.orc.OrcDataSource; import com.google.common.collect.ImmutableMap; import io.airlift.slice.Slice; import io.airlift.units.DataSize; @@ -25,9 +27,14 @@ import java.util.Map.Entry; import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; +import static com.facebook.presto.orc.OrcDataSourceUtils.getDiskRangeSlice; +import static com.facebook.presto.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; +/** + * File data source class for Orc Reader + * + * Most of code is from Presto + */ public class FileOrcDataSource implements OrcDataSource { diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java index 16414d2016..73ea47538d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java @@ -14,8 +14,8 @@ */ package org.apache.tajo.storage.thirdparty.orc; -import org.apache.tajo.storage.thirdparty.orc.DiskRange; -import org.apache.tajo.storage.thirdparty.orc.OrcDataSource; +import com.facebook.presto.orc.DiskRange; +import com.facebook.presto.orc.OrcDataSource; import com.google.common.collect.ImmutableMap; import io.airlift.slice.Slice; import io.airlift.units.DataSize; @@ -26,11 +26,16 @@ import java.util.Map; import java.util.Map.Entry; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; +import static com.facebook.presto.orc.OrcDataSourceUtils.getDiskRangeSlice; +import static com.facebook.presto.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; +/** + * HDFS File data source class for Orc Reader + * + * Most of code is from Presto + */ public class HdfsOrcDataSource implements OrcDataSource { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java deleted file mode 100644 index 7c9407a3e6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed 
under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public class LongVector - implements Vector -{ - public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; - public final long[] vector = new long[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - ObjectVector objectVector = new ObjectVector(); - for (int i = 0; i < size; i++) { - if (!isNull[i]) { - objectVector.vector[i] = vector[i]; - } - } - return objectVector; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java deleted file mode 100644 index 19f9608f7d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public class ObjectVector - implements Vector -{ - public final Object[] vector = new Object[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - return this; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java deleted file mode 100644 index c780bcb51f..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.jetbrains.annotations.Contract; - -import java.io.IOException; - -import static java.lang.String.format; - -public class OrcCorruptionException - extends IOException -{ - @Contract("false, _, _ -> fail") - public static void verifyFormat(boolean test, String messageFormat, Object... args) - throws OrcCorruptionException - { - if (!test) { - throw new OrcCorruptionException(messageFormat, args); - } - } - - public OrcCorruptionException(String messageFormat, Object... 
args) - { - super(format(messageFormat, args)); - } - - public OrcCorruptionException(Throwable cause, String messageFormat, Object... args) - { - super(format(messageFormat, args), cause); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java deleted file mode 100644 index 8eb1cbdd00..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import io.airlift.slice.Slice; - -import java.io.Closeable; -import java.io.IOException; -import java.util.Map; - -public interface OrcDataSource - extends Closeable -{ - long getReadTimeNanos(); - - long getSize(); - - void readFully(long position, byte[] buffer) - throws IOException; - - void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) - throws IOException; - - Map readFully(Map diskRanges) - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java deleted file mode 100644 index ba65c3c55c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.primitives.Ints; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; -import io.airlift.units.DataSize; - -import java.util.*; -import java.util.Map.Entry; - -import static com.google.common.collect.Lists.newArrayList; - -public final class OrcDataSourceUtils -{ - private OrcDataSourceUtils() - { - } - - /** - * Merge disk ranges that are closer than {@code maxMergeDistance}. 
- */ - public static Iterable mergeAdjacentDiskRanges(Iterable diskRanges, DataSize maxMergeDistance) - { - // sort ranges by start offset - List ranges = newArrayList(diskRanges); - Collections.sort(ranges, new Comparator() { - @Override - public int compare(DiskRange o1, DiskRange o2) { - return Long.compare(o1.getOffset(), o2.getOffset()); - } - }); - - // merge overlapping ranges - long maxMergeDistanceBytes = maxMergeDistance.toBytes(); - List result = new ArrayList(); - DiskRange last = ranges.get(0); - for (int i = 1; i < ranges.size(); i++) { - DiskRange current = ranges.get(i); - if (last.getEnd() + maxMergeDistanceBytes + 1 >= current.getOffset()) { - last = last.span(current); - } - else { - result.add(last); - last = current; - } - } - result.add(last); - - return result; - } - - /** - * Get a slice for the disk range from the provided buffers. The buffers ranges do not have - * to exactly match {@code diskRange}, but {@code diskRange} must be completely contained within - * one of the buffer ranges. 
- */ - public static Slice getDiskRangeSlice(DiskRange diskRange, Map buffers) - { - for (Entry bufferEntry : buffers.entrySet()) { - DiskRange bufferRange = bufferEntry.getKey(); - byte[] buffer = bufferEntry.getValue(); - if (bufferRange.contains(diskRange)) { - int offset = Ints.checkedCast(diskRange.getOffset() - bufferRange.getOffset()); - return Slices.wrappedBuffer(buffer, offset, diskRange.getLength()); - } - } - throw new IllegalStateException("No matching buffer for disk range"); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcPredicate.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcPredicate.java deleted file mode 100644 index b071056f58..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcPredicate.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnStatistics; - -import java.util.Map; - -public interface OrcPredicate -{ - OrcPredicate TRUE = new OrcPredicate() - { - @Override - public boolean matches(long numberOfRows, Map statisticsByColumnIndex) - { - return true; - } - }; - - /** - * Should the ORC reader process a file section with the specified statistics. 
- * - * @param numberOfRows the number of rows in the segment; this can be used with - * {@code ColumnStatistics} to determine if a column is only null - * @param statisticsByColumnIndex statistics for column by ordinal position - * in the file; this will match the field order from the hive metastore - */ - boolean matches(long numberOfRows, Map statisticsByColumnIndex); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java deleted file mode 100644 index 144baa5e7b..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.base.Joiner; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.stream.OrcInputStream; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; -import org.joda.time.DateTimeZone; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Set; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static io.airlift.slice.SizeOf.SIZE_OF_BYTE; - -public class OrcReader -{ - private static final Slice MAGIC = Slices.utf8Slice("ORC"); - private static final int CURRENT_MAJOR_VERSION = 0; - private static final int CURRENT_MINOR_VERSION = 12; - private static final int EXPECTED_FOOTER_SIZE = 16 * 1024; - - private final OrcDataSource orcDataSource; - private final MetadataReader metadataReader; - private final CompressionKind compressionKind; - private final int bufferSize; - private final Footer footer; - private final Metadata metadata; - - // This is based on the Apache Hive ORC code - public OrcReader(OrcDataSource orcDataSource, MetadataReader metadataReader) - throws IOException - { - this.orcDataSource = checkNotNull(orcDataSource, "orcDataSource is null"); - this.metadataReader = checkNotNull(metadataReader, "metadataReader is null"); - - // - // Read the file tail: - // - // variable: Footer - // variable: Metadata - // variable: PostScript - contains length of footer and metadata - // 3 bytes: file magic "ORC" - // 1 byte: postScriptSize = PostScript + Magic - - // figure out the size of the file using the option or filesystem - long size = orcDataSource.getSize(); - - // Read the tail of the file - byte[] buffer = new byte[(int) Math.min(size, EXPECTED_FOOTER_SIZE)]; - orcDataSource.readFully(size - buffer.length, buffer); - - // get length of 
PostScript - last byte of the file - int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff; - - // make sure this is an ORC file and not an RCFile or something else - verifyOrcFooter(orcDataSource, postScriptSize, buffer); - - // decode the post script - int postScriptOffset = buffer.length - SIZE_OF_BYTE - postScriptSize; - PostScript postScript = metadataReader.readPostScript(buffer, postScriptOffset, postScriptSize); - - // verify this is a supported version - checkOrcVersion(orcDataSource, postScript.getVersion()); - - // check compression codec is supported - this.compressionKind = postScript.getCompression(); - - this.bufferSize = Ints.checkedCast(postScript.getCompressionBlockSize()); - - int footerSize = Ints.checkedCast(postScript.getFooterLength()); - int metadataSize = Ints.checkedCast(postScript.getMetadataLength()); - - // check if extra bytes need to be read - Slice completeFooterSlice; - int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE; - if (completeFooterSize > buffer.length) { - // allocate a new buffer large enough for the complete footer - byte[] newBuffer = new byte[completeFooterSize]; - completeFooterSlice = Slices.wrappedBuffer(newBuffer); - - // initial read was not large enough, so read missing section - orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length); - - // copy already read bytes into the new buffer - completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer); - } - else { - // footer is already in the bytes in buffer, just adjust position, length - completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize); - } - - // read metadata - Slice metadataSlice = completeFooterSlice.slice(0, metadataSize); - InputStream metadataInputStream = new OrcInputStream(orcDataSource.toString(), metadataSlice.getInput(), compressionKind, bufferSize); - this.metadata = 
metadataReader.readMetadata(metadataInputStream); - - // read footer - Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize); - InputStream footerInputStream = new OrcInputStream(orcDataSource.toString(), footerSlice.getInput(), compressionKind, bufferSize); - this.footer = metadataReader.readFooter(footerInputStream); - } - - public List getColumnNames() - { - return footer.getTypes().get(0).getFieldNames(); - } - - public Footer getFooter() - { - return footer; - } - - public Metadata getMetadata() - { - return metadata; - } - - public CompressionKind getCompressionKind() - { - return compressionKind; - } - - public int getBufferSize() - { - return bufferSize; - } - - public OrcRecordReader createRecordReader( - Set includedColumns, - OrcPredicate predicate, - long offset, - long length, - DateTimeZone hiveStorageTimeZone) - throws IOException - { - return new OrcRecordReader( - checkNotNull(includedColumns, "includedColumns is null"), - checkNotNull(predicate, "predicate is null"), - footer.getNumberOfRows(), - footer.getStripes(), - footer.getFileStats(), - metadata.getStripeStatsList(), - orcDataSource, - offset, - length, - footer.getTypes(), - compressionKind, - bufferSize, - footer.getRowsInRowGroup(), - checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"), - metadataReader); - } - - /** - * Verify this is an ORC file to prevent users from trying to read text - * files or RC files as ORC files. - */ - // This is based on the Apache Hive ORC code - private static void verifyOrcFooter( - OrcDataSource source, - int postScriptSize, - byte[] buffer) - throws IOException - { - int magicLength = MAGIC.length(); - checkArgument(postScriptSize >= magicLength + 1, "Malformed ORC file %s. 
Invalid postscript length %s", source, postScriptSize); - - if (!MAGIC.equals(Slices.wrappedBuffer(buffer, buffer.length - 1 - magicLength, magicLength))) { - // Old versions of ORC (0.11) wrote the magic to the head of the file - byte[] headerMagic = new byte[magicLength]; - source.readFully(0, headerMagic); - - // if it isn't there, this isn't an ORC file - checkArgument(MAGIC.equals(Slices.wrappedBuffer(headerMagic)), "Malformed ORC file %s. Invalid postscript.", source); - } - } - - /** - * Check to see if this ORC file is from a future version and if so, - * warn the user that we may not be able to read all of the column encodings. - */ - // This is based on the Apache Hive ORC code - private static void checkOrcVersion(OrcDataSource orcDataSource, List version) - { - if (version.size() >= 1) { - int major = version.get(0); - int minor = 0; - if (version.size() > 1) { - minor = version.get(1); - } - - if (major > CURRENT_MAJOR_VERSION || (major == CURRENT_MAJOR_VERSION && minor > CURRENT_MINOR_VERSION)) { - System.err.println(String.format("ORC file %s was written by a newer Hive version %s. This file may not be readable by this version of Hive (%s.%s).", - orcDataSource, - Joiner.on('.').join(version), - CURRENT_MAJOR_VERSION, - CURRENT_MINOR_VERSION)); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java deleted file mode 100644 index 9f0e78300d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.reader.StreamReader; -import org.apache.tajo.storage.thirdparty.orc.reader.StreamReaders; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTimeZone; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -public class OrcRecordReader -{ - private final OrcDataSource orcDataSource; - - private final StreamReader[] streamReaders; - - private final long totalRowCount; - private final long splitLength; - private final Set presentColumns; - private long currentPosition; - - private final List stripes; - private final StripeReader stripeReader; - private int currentStripe = -1; - - private Iterator rowGroups = ImmutableList.of().iterator(); - private long currentGroupRowCount; - private long nextRowInGroup; - - public OrcRecordReader( - Set includedColumns, - OrcPredicate predicate, - long numberOfRows, - List fileStripes, - List fileStats, - List stripeStats, - OrcDataSource 
orcDataSource, - long splitOffset, - long splitLength, - List types, - CompressionKind compressionKind, - int bufferSize, - int rowsInRowGroup, - DateTimeZone hiveStorageTimeZone, - MetadataReader metadataReader) - throws IOException - { - checkNotNull(includedColumns, "includedColumns is null"); - checkNotNull(predicate, "predicate is null"); - checkNotNull(fileStripes, "fileStripes is null"); - checkNotNull(stripeStats, "stripeStats is null"); - checkNotNull(orcDataSource, "orcDataSource is null"); - checkNotNull(types, "types is null"); - checkNotNull(compressionKind, "compressionKind is null"); - checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"); - - // reduce the included columns to the set that is also present - ImmutableSet.Builder presentColumns = ImmutableSet.builder(); - OrcType root = types.get(0); - for (int includedColumn : includedColumns) { - // an old file can have less columns since columns can be added - // after the file was written - if (includedColumn < root.getFieldCount()) { - presentColumns.add(includedColumn); - } - } - this.presentColumns = presentColumns.build(); - - this.orcDataSource = orcDataSource; - this.splitLength = splitLength; - - // it is possible that old versions of orc use 0 to mean there are no row groups - checkArgument(rowsInRowGroup > 0, "rowsInRowGroup must be greater than zero"); - - long totalRowCount = 0; - ImmutableList.Builder stripes = ImmutableList.builder(); - if (predicate.matches(numberOfRows, getStatisticsByColumnOrdinal(root, fileStats))) { - // select stripes that start within the specified split - for (int stripeIndex = 0; stripeIndex < fileStripes.size(); stripeIndex++) { - StripeInformation stripe = fileStripes.get(stripeIndex); - if (splitContainsStripe(splitOffset, splitLength, stripe) && isStripeIncluded(root, stripe, stripeStats, predicate, stripeIndex)) { - stripes.add(stripe); - totalRowCount += stripe.getNumberOfRows(); - } - } - } - this.totalRowCount = totalRowCount; - this.stripes 
= stripes.build(); - - stripeReader = new StripeReader( - orcDataSource, - compressionKind, - types, - bufferSize, - this.presentColumns, - rowsInRowGroup, - predicate, - metadataReader); - - streamReaders = createStreamReaders(orcDataSource, types, hiveStorageTimeZone, this.presentColumns); - } - - private static boolean splitContainsStripe(long splitOffset, long splitLength, StripeInformation stripe) - { - long splitEndOffset = splitOffset + splitLength; - return splitOffset <= stripe.getOffset() && stripe.getOffset() < splitEndOffset; - } - - private static boolean isStripeIncluded( - OrcType rootStructType, - StripeInformation stripe, - List stripeStats, - OrcPredicate predicate, - int stripeIndex) - { - // if there are no stats, include the column - if (stripeIndex >= stripeStats.size()) { - return true; - } - - return predicate.matches(stripe.getNumberOfRows(), getStatisticsByColumnOrdinal(rootStructType, stripeStats.get(stripeIndex).getColumnStatistics())); - } - - public long getPosition() - { - return currentPosition; - } - - public long getTotalRowCount() - { - return totalRowCount; - } - - public float getProgress() - { - return ((float) currentPosition) / totalRowCount; - } - - public long getSplitLength() - { - return splitLength; - } - - public void close() - throws IOException - { - orcDataSource.close(); - } - - public boolean isColumnPresent(int hiveColumnIndex) - { - return presentColumns.contains(hiveColumnIndex); - } - - public int nextBatch() - throws IOException - { - // if next row is within the current group return - if (nextRowInGroup >= currentGroupRowCount) { - // attempt to advance to next row group - if (!advanceToNextRowGroup()) { - return -1; - } - } - - int batchSize = Ints.checkedCast(Math.min(Vector.MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup)); - - for (StreamReader column : streamReaders) { - if (column != null) { - column.prepareNextRead(batchSize); - } - } - nextRowInGroup += batchSize; - currentPosition += 
batchSize; - return batchSize; - } - - public void readVector(int columnIndex, Object vector) - throws IOException - { - streamReaders[columnIndex].readBatch(vector); - } - - private boolean advanceToNextRowGroup() - throws IOException - { - nextRowInGroup = 0; - - while (!rowGroups.hasNext() && currentStripe < stripes.size()) { - advanceToNextStripe(); - } - - if (!rowGroups.hasNext()) { - currentGroupRowCount = 0; - return false; - } - - RowGroup currentRowGroup = rowGroups.next(); - currentGroupRowCount = currentRowGroup.getRowCount(); - - // give reader data streams from row group - StreamSources rowGroupStreamSources = currentRowGroup.getStreamSources(); - for (StreamReader column : streamReaders) { - if (column != null) { - column.startRowGroup(rowGroupStreamSources); - } - } - - return true; - } - - private void advanceToNextStripe() - throws IOException - { - currentStripe++; - if (currentStripe >= stripes.size()) { - return; - } - - StripeInformation stripeInformation = stripes.get(currentStripe); - Stripe stripe = stripeReader.readStripe(stripeInformation); - if (stripe != null) { - // Give readers access to dictionary streams - StreamSources dictionaryStreamSources = stripe.getDictionaryStreamSources(); - List columnEncodings = stripe.getColumnEncodings(); - for (StreamReader column : streamReaders) { - if (column != null) { - column.startStripe(dictionaryStreamSources, columnEncodings); - } - } - - rowGroups = stripe.getRowGroups().iterator(); - } - else { - rowGroups = ImmutableList.of().iterator(); - } - } - - private static StreamReader[] createStreamReaders(OrcDataSource orcDataSource, - List types, - DateTimeZone hiveStorageTimeZone, - Set includedColumns) - { - List streamDescriptors = createStreamDescriptor("", "", 0, types, orcDataSource).getNestedStreams(); - - OrcType rowType = types.get(0); - StreamReader[] streamReaders = new StreamReader[rowType.getFieldCount()]; - for (int columnId = 0; columnId < rowType.getFieldCount(); columnId++) { - 
if (includedColumns.contains(columnId)) { - StreamDescriptor streamDescriptor = streamDescriptors.get(columnId); - streamReaders[columnId] = StreamReaders.createStreamReader(streamDescriptor, hiveStorageTimeZone); - } - } - return streamReaders; - } - - private static StreamDescriptor createStreamDescriptor(String parentStreamName, String fieldName, int typeId, List types, OrcDataSource dataSource) - { - OrcType type = types.get(typeId); - - if (!fieldName.isEmpty()) { - parentStreamName += "." + fieldName; - } - - ImmutableList.Builder nestedStreams = ImmutableList.builder(); - if (type.getOrcTypeKind() == OrcTypeKind.STRUCT) { - for (int i = 0; i < type.getFieldCount(); ++i) { - nestedStreams.add(createStreamDescriptor(parentStreamName, type.getFieldName(i), type.getFieldTypeIndex(i), types, dataSource)); - } - } - else if (type.getOrcTypeKind() == OrcTypeKind.LIST) { - nestedStreams.add(createStreamDescriptor(parentStreamName, "item", type.getFieldTypeIndex(0), types, dataSource)); - } - else if (type.getOrcTypeKind() == OrcTypeKind.MAP) { - nestedStreams.add(createStreamDescriptor(parentStreamName, "key", type.getFieldTypeIndex(0), types, dataSource)); - nestedStreams.add(createStreamDescriptor(parentStreamName, "value", type.getFieldTypeIndex(1), types, dataSource)); - } - return new StreamDescriptor(parentStreamName, typeId, fieldName, type.getOrcTypeKind(), dataSource, nestedStreams.build()); - } - - private static Map getStatisticsByColumnOrdinal(OrcType rootStructType, List fileStats) - { - checkNotNull(rootStructType, "rootStructType is null"); - checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT); - checkNotNull(fileStats, "fileStats is null"); - - ImmutableMap.Builder statistics = ImmutableMap.builder(); - for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) { - ColumnStatistics element = fileStats.get(rootStructType.getFieldTypeIndex(ordinal)); - if (element != null) { - statistics.put(ordinal, element); - } - } - 
return statistics.build(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RowGroup.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RowGroup.java deleted file mode 100644 index a919cd69e1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RowGroup.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public class RowGroup -{ - private final int groupId; - private final long rowCount; - private final StreamSources streamSources; - - public RowGroup(int groupId, long rowCount, StreamSources streamSources) - { - this.groupId = groupId; - this.rowCount = rowCount; - this.streamSources = checkNotNull(streamSources, "streamSources is null"); - } - - public int getGroupId() - { - return groupId; - } - - public long getRowCount() - { - return rowCount; - } - - public StreamSources getStreamSources() - { - return streamSources; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("groupId", groupId) - .add("rowCount", rowCount) - .add("streamSources", streamSources) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java deleted file mode 100644 index 01cfbfca80..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; -import io.airlift.slice.Slice; - -public class SliceVector - implements Vector -{ - public final Slice[] vector = new Slice[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - ObjectVector objectVector = new ObjectVector(); - for (int i = 0; i < size; i++) { - if (vector[i] != null) { - objectVector.vector[i] = vector[i].toStringUtf8(); - } - } - return objectVector; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java deleted file mode 100644 index a8108e6f36..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the 
"License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.ImmutableList; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; - -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public final class StreamDescriptor -{ - private final String streamName; - private final int streamId; - private final OrcTypeKind streamType; - private final String fieldName; - private final OrcDataSource fileInput; - private final List nestedStreams; - - public StreamDescriptor(String streamName, int streamId, String fieldName, OrcTypeKind streamType, OrcDataSource fileInput, List nestedStreams) - { - this.streamName = checkNotNull(streamName, "streamName is null"); - this.streamId = streamId; - this.fieldName = checkNotNull(fieldName, "fieldName is null"); - this.streamType = checkNotNull(streamType, "type is null"); - this.fileInput = checkNotNull(fileInput, "fileInput is null"); - this.nestedStreams = ImmutableList.copyOf(checkNotNull(nestedStreams, "nestedStreams is null")); - } - - public String getStreamName() - { - return streamName; - } - - public int getStreamId() - { - return streamId; - } - - public OrcTypeKind getStreamType() - { - return streamType; - } - - public String getFieldName() - { - return fieldName; - } - - public OrcDataSource getFileInput() - { - return fileInput; - } - - public List getNestedStreams() 
- { - return nestedStreams; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("streamName", streamName) - .add("streamId", streamId) - .add("streamType", streamType) - .add("path", fileInput) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java deleted file mode 100644 index 3cec23c247..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; - -import java.util.Objects; - -import static com.google.common.base.MoreObjects.toStringHelper; - -public final class StreamId -{ - private final int column; - private final StreamKind streamKind; - - public StreamId(Stream stream) - { - this.column = stream.getColumn(); - this.streamKind = stream.getStreamKind(); - } - - public StreamId(int column, StreamKind streamKind) - { - this.column = column; - this.streamKind = streamKind; - } - - public int getColumn() - { - return column; - } - - public StreamKind getStreamKind() - { - return streamKind; - } - - @Override - public int hashCode() - { - return Objects.hash(column, streamKind); - } - - @Override - public boolean equals(Object obj) - { - if (this == obj) { - return true; - } - if (obj == null || getClass() != obj.getClass()) { - return false; - } - StreamId other = (StreamId) obj; - return Objects.equals(this.column, other.column) && Objects.equals(this.streamKind, other.streamKind); - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("column", column) - .add("streamKind", streamKind) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java deleted file mode 100644 index a95353160e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.ImmutableList; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public class Stripe -{ - private final long rowCount; - private final List columnEncodings; - private final List rowGroups; - private final StreamSources dictionaryStreamSources; - - public Stripe(long rowCount, List columnEncodings, List rowGroups, StreamSources dictionaryStreamSources) - { - this.rowCount = rowCount; - this.columnEncodings = checkNotNull(columnEncodings, "columnEncodings is null"); - this.rowGroups = ImmutableList.copyOf(checkNotNull(rowGroups, "rowGroups is null")); - this.dictionaryStreamSources = checkNotNull(dictionaryStreamSources, "dictionaryStreamSources is null"); - } - - public long getRowCount() - { - return rowCount; - } - - public List getColumnEncodings() - { - return columnEncodings; - } - - public List getRowGroups() - { - return rowGroups; - } - - public StreamSources getDictionaryStreamSources() - { - return dictionaryStreamSources; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("rowCount", rowCount) - .add("columnEncodings", columnEncodings) - .add("rowGroups", rowGroups) - .add("dictionaryStreams", dictionaryStreamSources) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java deleted file mode 100644 index 1e4c4bc273..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.base.Function; -import com.google.common.base.Predicates; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Maps; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.stream.*; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; - -import java.io.IOException; -import java.io.InputStream; -import java.util.*; -import java.util.Map.Entry; - -import static com.google.common.base.Preconditions.checkArgument; -import static 
com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getStreamCheckpoints; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.CheckpointStreamSource.createCheckpointStreamSource; - -public class StripeReader -{ - private final OrcDataSource orcDataSource; - private final CompressionKind compressionKind; - private final List types; - private final int bufferSize; - private final Set includedOrcColumns; - private final int rowsInRowGroup; - private final OrcPredicate predicate; - private final MetadataReader metadataReader; - - public StripeReader(OrcDataSource orcDataSource, - CompressionKind compressionKind, - List types, - int bufferSize, - Set includedColumns, - int rowsInRowGroup, - OrcPredicate predicate, - MetadataReader metadataReader) - { - this.orcDataSource = checkNotNull(orcDataSource, "orcDataSource is null"); - this.compressionKind = checkNotNull(compressionKind, "compressionKind is null"); - this.types = ImmutableList.copyOf(checkNotNull(types, "types is null")); - this.bufferSize = bufferSize; - this.includedOrcColumns = getIncludedOrcColumns(types, checkNotNull(includedColumns, "includedColumns is null")); - this.rowsInRowGroup = rowsInRowGroup; - this.predicate = checkNotNull(predicate, "predicate is null"); - this.metadataReader = checkNotNull(metadataReader, "metadataReader is null"); - } - - public Stripe readStripe(StripeInformation stripe) - throws IOException - { - // read the stripe footer - StripeFooter stripeFooter = readStripeFooter(stripe); - List 
columnEncodings = stripeFooter.getColumnEncodings(); - - // get streams for selected columns - Map streams = new HashMap(); - for (Stream stream : stripeFooter.getStreams()) { - if (includedOrcColumns.contains(stream.getColumn())) { - streams.put(new StreamId(stream), stream); - } - } - - // determine ranges of the stripe to read - Map diskRanges = getDiskRanges(stripeFooter.getStreams()); - diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet())); - - // read the file regions - Map streamsData = readDiskRanges(stripe.getOffset(), diskRanges); - - // read the row index for each column - Map> columnIndexes = readColumnIndexes(streams, streamsData); - - // select the row groups matching the tuple domain - Set selectedRowGroups = selectRowGroups(stripe, columnIndexes); - - // if all row groups are skipped, return null - if (selectedRowGroups.isEmpty()) { - return null; - } - - // value streams - Map> valueStreams = createValueStreams(streams, streamsData, columnEncodings); - - // build the dictionary streams - StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings); - - // build the row groups - List rowGroups = createRowGroups( - stripe.getNumberOfRows(), - streams, - valueStreams, - columnIndexes, - selectedRowGroups, - columnEncodings); - - return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources); - } - - public Map readDiskRanges(final long stripeOffset, Map diskRanges) - throws IOException - { - // transform ranges to have an absolute offset in file - diskRanges = Maps.transformValues(diskRanges, new Function() { - @Override - public DiskRange apply(DiskRange diskRange) - { - return new DiskRange(stripeOffset + diskRange.getOffset(), diskRange.getLength()); - } - }); - - Map streamsData = orcDataSource.readFully(diskRanges); - - return ImmutableMap.copyOf(Maps.transformValues(streamsData, new Function() - { - @Override - public OrcInputStream apply(Slice 
input) - { - return new OrcInputStream(orcDataSource.toString(), input.getInput(), compressionKind, bufferSize); - } - })); - } - - private Map> createValueStreams(Map streams, Map streamsData, List columnEncodings) - { - ImmutableMap.Builder> valueStreams = ImmutableMap.builder(); - for (Entry entry : streams.entrySet()) { - StreamId streamId = entry.getKey(); - Stream stream = entry.getValue(); - ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind(); - - // skip index and empty streams - if (isIndexStream(stream) || stream.getLength() == 0) { - continue; - } - - OrcInputStream inputStream = streamsData.get(streamId); - OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); - - valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); - } - return valueStreams.build(); - } - - public StreamSources createDictionaryStreamSources(Map streams, Map> valueStreams, List columnEncodings) - { - ImmutableMap.Builder> dictionaryStreamBuilder = ImmutableMap.builder(); - for (Entry entry : streams.entrySet()) { - StreamId streamId = entry.getKey(); - Stream stream = entry.getValue(); - int column = stream.getColumn(); - - // only process dictionary streams - ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind(); - if (!isDictionary(stream, columnEncoding)) { - continue; - } - - // skip streams without data - ValueStream valueStream = valueStreams.get(streamId); - if (valueStream == null) { - continue; - } - - OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); - StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding); - - StreamSource streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint); - dictionaryStreamBuilder.put(streamId, streamSource); - } - return new StreamSources(dictionaryStreamBuilder.build()); - } - - 
private List createRowGroups( - int rowsInStripe, - Map streams, - Map> valueStreams, - Map> columnIndexes, - Set selectedRowGroups, - List encodings) - { - ImmutableList.Builder rowGroupBuilder = ImmutableList.builder(); - - for (int rowGroupId : selectedRowGroups) { - Map checkpoints = getStreamCheckpoints(includedOrcColumns, types, compressionKind, rowGroupId, encodings, streams, columnIndexes); - int rowsInGroup = Math.min(rowsInStripe - (rowGroupId * rowsInRowGroup), rowsInRowGroup); - rowGroupBuilder.add(createRowGroup(rowGroupId, rowsInGroup, valueStreams, checkpoints)); - } - - return rowGroupBuilder.build(); - } - - public static RowGroup createRowGroup(int groupId, int rowCount, Map> valueStreams, Map checkpoints) - { - ImmutableMap.Builder> builder = ImmutableMap.builder(); - for (Entry entry : checkpoints.entrySet()) { - StreamId streamId = entry.getKey(); - StreamCheckpoint checkpoint = entry.getValue(); - - // skip streams without data - ValueStream valueStream = valueStreams.get(streamId); - if (valueStream == null) { - continue; - } - - builder.put(streamId, createCheckpointStreamSource(valueStream, checkpoint)); - } - StreamSources rowGroupStreams = new StreamSources(builder.build()); - return new RowGroup(groupId, rowCount, rowGroupStreams); - } - - public StripeFooter readStripeFooter(StripeInformation stripe) - throws IOException - { - long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(); - int tailLength = Ints.checkedCast(stripe.getFooterLength()); - - // read the footer - byte[] tailBuffer = new byte[tailLength]; - orcDataSource.readFully(offset, tailBuffer); - InputStream inputStream = new OrcInputStream(orcDataSource.toString(), Slices.wrappedBuffer(tailBuffer).getInput(), compressionKind, bufferSize); - return metadataReader.readStripeFooter(types, inputStream); - } - - private Map> readColumnIndexes(Map streams, Map streamsData) - throws IOException - { - ImmutableMap.Builder> columnIndexes = 
ImmutableMap.builder(); - for (Entry entry : streams.entrySet()) { - Stream stream = entry.getValue(); - if (stream.getStreamKind() == ROW_INDEX) { - OrcInputStream inputStream = streamsData.get(entry.getKey()); - columnIndexes.put(stream.getColumn(), metadataReader.readRowIndexes(inputStream)); - } - } - return columnIndexes.build(); - } - - private Set selectRowGroups(StripeInformation stripe, Map> columnIndexes) - throws IOException - { - int rowsInStripe = Ints.checkedCast(stripe.getNumberOfRows()); - int groupsInStripe = ceil(rowsInStripe, rowsInRowGroup); - - ImmutableSet.Builder selectedRowGroups = ImmutableSet.builder(); - int remainingRows = rowsInStripe; - for (int rowGroup = 0; rowGroup < groupsInStripe; ++rowGroup) { - int rows = Math.min(remainingRows, rowsInRowGroup); - Map statistics = getRowGroupStatistics(types.get(0), columnIndexes, rowGroup); - if (predicate.matches(rows, statistics)) { - selectedRowGroups.add(rowGroup); - } - remainingRows -= rows; - } - return selectedRowGroups.build(); - } - - private static Map getRowGroupStatistics(OrcType rootStructType, Map> columnIndexes, int rowGroup) - { - checkNotNull(rootStructType, "rootStructType is null"); - checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT); - checkNotNull(columnIndexes, "columnIndexes is null"); - checkArgument(rowGroup >= 0, "rowGroup is negative"); - - ImmutableMap.Builder statistics = ImmutableMap.builder(); - for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) { - List rowGroupIndexes = columnIndexes.get(rootStructType.getFieldTypeIndex(ordinal)); - if (rowGroupIndexes != null) { - statistics.put(ordinal, rowGroupIndexes.get(rowGroup).getColumnStatistics()); - } - } - return statistics.build(); - } - - private static boolean isIndexStream(Stream stream) - { - return stream.getStreamKind() == ROW_INDEX || stream.getStreamKind() == DICTIONARY_COUNT; - } - - private static boolean isDictionary(Stream stream, ColumnEncodingKind 
columnEncoding) - { - return stream.getStreamKind() == DICTIONARY_DATA || (stream.getStreamKind() == LENGTH && (columnEncoding == DICTIONARY || columnEncoding == DICTIONARY_V2)); - } - - private static Map getDiskRanges(List streams) - { - ImmutableMap.Builder streamDiskRanges = ImmutableMap.builder(); - long stripeOffset = 0; - for (Stream stream : streams) { - int streamLength = Ints.checkedCast(stream.getLength()); - streamDiskRanges.put(new StreamId(stream), new DiskRange(stripeOffset, streamLength)); - stripeOffset += streamLength; - } - return streamDiskRanges.build(); - } - - private static Set getIncludedOrcColumns(List types, Set includedColumns) - { - Set includes = new LinkedHashSet(); - - OrcType root = types.get(0); - for (int includedColumn : includedColumns) { - includeOrcColumnsRecursive(types, includes, root.getFieldTypeIndex(includedColumn)); - } - - return includes; - } - - private static void includeOrcColumnsRecursive(List types, Set result, int typeId) - { - result.add(typeId); - OrcType type = types.get(typeId); - int children = type.getFieldCount(); - for (int i = 0; i < children; ++i) { - includeOrcColumnsRecursive(types, result, type.getFieldTypeIndex(i)); - } - } - - /** - * Ceiling of integer division - */ - private static int ceil(int dividend, int divisor) - { - return ((dividend + divisor) - 1) / divisor; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Vector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Vector.java deleted file mode 100644 index e655ac416a..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Vector.java +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public interface Vector -{ - int MAX_VECTOR_LENGTH = 1024; - - @VisibleForTesting - ObjectVector toObjectVector(int size); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java deleted file mode 100644 index 4fd403e643..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static com.google.common.base.Preconditions.checkNotNull; - -public final class BooleanStreamCheckpoint - implements StreamCheckpoint -{ - private final int offset; - private final ByteStreamCheckpoint byteStreamCheckpoint; - - public BooleanStreamCheckpoint(int offset, ByteStreamCheckpoint byteStreamCheckpoint) - { - this.offset = offset; - this.byteStreamCheckpoint = checkNotNull(byteStreamCheckpoint, "byteStreamCheckpoint is null"); - } - - public BooleanStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - byteStreamCheckpoint = new ByteStreamCheckpoint(compressionKind, positionsList); - offset = positionsList.nextPosition(); - } - - public int getOffset() - { - return offset; - } - - public ByteStreamCheckpoint getByteStreamCheckpoint() - { - return byteStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("offset", offset) - .add("byteStreamCheckpoint", byteStreamCheckpoint) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java deleted file mode 100644 index a76d5c286e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class ByteArrayStreamCheckpoint - implements StreamCheckpoint -{ - private final long inputStreamCheckpoint; - - public ByteArrayStreamCheckpoint(long inputStreamCheckpoint) - { - this.inputStreamCheckpoint = inputStreamCheckpoint; - } - - public ByteArrayStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java deleted file mode 100644 index c7a93ea169..0000000000 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class ByteStreamCheckpoint - implements StreamCheckpoint -{ - private final int offset; - private final long inputStreamCheckpoint; - - public ByteStreamCheckpoint(int offset, long inputStreamCheckpoint) - { - this.offset = offset; - this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); - } - - public ByteStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - offset = positionsList.nextPosition(); - } - - public int getOffset() - { - return offset; - } - - public long getInputStreamCheckpoint() - { - return 
inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("offset", offset) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java deleted file mode 100644 index f346235d94..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.collect.*; -import org.apache.tajo.storage.thirdparty.orc.StreamId; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; - -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static com.google.common.base.Preconditions.checkNotNull; -import static com.google.common.base.Preconditions.checkState; -import static com.google.common.base.Predicates.equalTo; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; - -public final class Checkpoints -{ - private Checkpoints() - { - } - - public static Map getStreamCheckpoints( - Set columns, - List columnTypes, - CompressionKind compressionKind, - int rowGroupId, - List columnEncodings, - Map streams, - Map> columnIndexes) - { - ImmutableSetMultimap.Builder streamKindsBuilder = ImmutableSetMultimap.builder(); - for (Stream stream : streams.values()) { - streamKindsBuilder.put(stream.getColumn(), stream.getStreamKind()); - } - SetMultimap streamKinds = streamKindsBuilder.build(); - - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - for (int column : columns) { - List positionsList = columnIndexes.get(column).get(rowGroupId).getPositions(); - - ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind(); - OrcTypeKind columnType = columnTypes.get(column).getOrcTypeKind(); - Set availableStreams = streamKinds.get(column); - - ColumnPositionsList columnPositionsList = new 
ColumnPositionsList(column, columnType, positionsList); - switch (columnType) { - case BOOLEAN: - checkpoints.putAll(getBooleanColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case BYTE: - checkpoints.putAll(getByteColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case SHORT: - case INT: - case LONG: - case DATE: - checkpoints.putAll(getLongColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); - break; - case FLOAT: - checkpoints.putAll(getFloatColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case DOUBLE: - checkpoints.putAll(getDoubleColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case TIMESTAMP: - checkpoints.putAll(getTimestampColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); - break; - case BINARY: - case STRING: - checkpoints.putAll(getSliceColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); - break; - case LIST: - case MAP: - checkpoints.putAll(getListOrMapColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); - break; - case STRUCT: - checkpoints.putAll(getStructColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case DECIMAL: - case CHAR: - case VARCHAR: - case UNION: - throw new IllegalArgumentException("Unsupported column type " + columnType); - } - - // The DWRF code is not meticulous in the handling of checkpoints. It appears that for the first row group - // it will write checkpoints for all streams, but in other cases it will write only the streams that exist. 
- // We detect this case by checking that all offsets in the initial position list are zero, and if so, we - // clear the extra offsets - checkState(!columnPositionsList.hasNextPosition() || Iterables.all(positionsList, equalTo(0)), - "Column %s, of type %s, contains %s offset positions, but only %s positions were consumed", - column, - columnType, - positionsList.size(), - columnPositionsList.getIndex()); - } - return checkpoints.build(); - } - - public static StreamCheckpoint getDictionaryStreamCheckpoint(StreamId streamId, OrcTypeKind columnType, ColumnEncodingKind columnEncoding) - { - if (streamId.getStreamKind() == DICTIONARY_DATA) { - switch (columnType) { - case SHORT: - case INT: - case LONG: - return new LongStreamDwrfCheckpoint(createInputStreamCheckpoint(0, 0)); - case STRING: - case VARCHAR: - case CHAR: - case BINARY: - return new ByteArrayStreamCheckpoint(createInputStreamCheckpoint(0, 0)); - } - } - - // dictionary length and data streams are unsigned long streams - if (streamId.getStreamKind() == LENGTH || streamId.getStreamKind() == DATA) { - if (columnEncoding == DICTIONARY_V2) { - return new LongStreamV2Checkpoint(0, createInputStreamCheckpoint(0, 0)); - } - else if (columnEncoding == DICTIONARY) { - return new LongStreamV1Checkpoint(0, createInputStreamCheckpoint(0, 0)); - } - } - throw new IllegalArgumentException("Unsupported column type " + columnType + " for dictionary stream " + streamId); - } - - private static Map getBooleanColumnCheckpoints( - int column, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - return 
checkpoints.build(); - } - - private static Map getByteColumnCheckpoints( - int column, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new ByteStreamCheckpoint(compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getLongColumnCheckpoints( - int column, - ColumnEncodingKind encoding, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(IN_DICTIONARY)) { - checkpoints.put(new StreamId(column, IN_DICTIONARY), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getFloatColumnCheckpoints( - int column, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new FloatStreamCheckpoint(compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map 
getDoubleColumnCheckpoints( - int column, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new DoubleStreamCheckpoint(compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getTimestampColumnCheckpoints( - int column, - ColumnEncodingKind encoding, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - if (availableStreams.contains(SECONDARY)) { - checkpoints.put(new StreamId(column, SECONDARY), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getSliceColumnCheckpoints( - int column, - ColumnEncodingKind encoding, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (encoding == DIRECT || encoding == DIRECT_V2) { - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new ByteArrayStreamCheckpoint(compressionKind, positionsList)); - } - - if 
(availableStreams.contains(LENGTH)) { - checkpoints.put(new StreamId(column, LENGTH), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - } - else if (encoding == DICTIONARY || encoding == DICTIONARY_V2) { - // DWRF has rules inconsistent with the ORC style - if (availableStreams.contains(IN_DICTIONARY)) { - if (availableStreams.contains(ROW_GROUP_DICTIONARY)) { - checkpoints.put(new StreamId(column, ROW_GROUP_DICTIONARY), new ByteArrayStreamCheckpoint(compressionKind, positionsList)); - } - - checkpoints.put(new StreamId(column, ROW_GROUP_DICTIONARY_LENGTH), new RowGroupDictionaryLengthStreamCheckpoint(compressionKind, positionsList)); - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - checkpoints.put(new StreamId(column, IN_DICTIONARY), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - else { - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - } - } - else { - throw new IllegalArgumentException("Unsupported encoding for slice column: " + encoding); - } - - return checkpoints.build(); - } - - private static Map getListOrMapColumnCheckpoints( - int column, - ColumnEncodingKind encoding, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(LENGTH)) { - checkpoints.put(new StreamId(column, LENGTH), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getStructColumnCheckpoints( - int column, - CompressionKind 
compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static StreamCheckpoint createLongStreamCheckpoint(ColumnEncodingKind encoding, CompressionKind compressionKind, ColumnPositionsList positionsList) - { - if (encoding == DIRECT_V2 || encoding == DICTIONARY_V2) { - return new LongStreamV2Checkpoint(compressionKind, positionsList); - } - - if (encoding == DIRECT || encoding == DICTIONARY) { - return new LongStreamV1Checkpoint(compressionKind, positionsList); - } - - if (encoding == DWRF_DIRECT) { - return new LongStreamDwrfCheckpoint(compressionKind, positionsList); - } - - throw new IllegalArgumentException("Unsupported encoding for long stream: " + encoding); - } - - public static class ColumnPositionsList - { - private final int column; - private final OrcTypeKind columnType; - private final List positionsList; - private int index; - - private ColumnPositionsList(int column, OrcTypeKind columnType, List positionsList) - { - this.column = column; - this.columnType = checkNotNull(columnType, "columnType is null"); - this.positionsList = ImmutableList.copyOf(checkNotNull(positionsList, "positionsList is null")); - } - - public int getIndex() - { - return index; - } - - public boolean hasNextPosition() - { - return index < positionsList.size(); - } - - public int nextPosition() - { - checkState(hasNextPosition(), "Not enough positions for column %s, of type %s, checkpoints", - column, - columnType); - - return positionsList.get(index++); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java deleted file mode 100644 index 80f03de1d9..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class DoubleStreamCheckpoint - implements StreamCheckpoint -{ - private final long inputStreamCheckpoint; - - public DoubleStreamCheckpoint(long inputStreamCheckpoint) - { - this.inputStreamCheckpoint = inputStreamCheckpoint; - } - - public DoubleStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return 
MoreObjects.toStringHelper(this) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java deleted file mode 100644 index 2d92cd3494..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class FloatStreamCheckpoint - implements StreamCheckpoint -{ - private final long inputStreamCheckpoint; - - public FloatStreamCheckpoint(long inputStreamCheckpoint) - { - this.inputStreamCheckpoint = inputStreamCheckpoint; - } - - public FloatStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java deleted file mode 100644 index 92550a6b91..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.UNCOMPRESSED; - -/** - * InputStreamCheckpoint is represented as a packed long to avoid object creation in inner loops. - */ -public final class InputStreamCheckpoint -{ - private InputStreamCheckpoint() - { - } - - public static long createInputStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - if (compressionKind == UNCOMPRESSED) { - return createInputStreamCheckpoint(0, positionsList.nextPosition()); - } - else { - return createInputStreamCheckpoint(positionsList.nextPosition(), positionsList.nextPosition()); - } - } - - public static long createInputStreamCheckpoint(int compressedBlockOffset, int decompressedOffset) - { - return (((long) compressedBlockOffset) << 32) | decompressedOffset; - } - - public static int decodeCompressedBlockOffset(long inputStreamCheckpoint) - { - return ((int) (inputStreamCheckpoint >> 32)); - } - - public static int decodeDecompressedOffset(long inputStreamCheckpoint) - { - // low order bits contain the decompressed offset, so a simple cast here will suffice - return (int) inputStreamCheckpoint; - } - - public static String inputStreamCheckpointToString(long inputStreamCheckpoint) - { - return 
MoreObjects.toStringHelper(InputStreamCheckpoint.class) - .add("decompressedOffset", decodeDecompressedOffset(inputStreamCheckpoint)) - .add("compressedBlockOffset", decodeCompressedBlockOffset(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamCheckpoint.java deleted file mode 100644 index a142e39f5d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamCheckpoint.java +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -public interface LongStreamCheckpoint - extends StreamCheckpoint -{ -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java deleted file mode 100644 index bb08edd940..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class LongStreamDwrfCheckpoint - implements LongStreamCheckpoint -{ - private final long inputStreamCheckpoint; - - public LongStreamDwrfCheckpoint(long inputStreamCheckpoint) - { - this.inputStreamCheckpoint = inputStreamCheckpoint; - } - - public LongStreamDwrfCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java deleted file mode 100644 index 410f181d38..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public class LongStreamV1Checkpoint - implements LongStreamCheckpoint -{ - private final int offset; - private final long inputStreamCheckpoint; - - public LongStreamV1Checkpoint(int offset, long inputStreamCheckpoint) - { - this.offset = offset; - this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); - } - - public LongStreamV1Checkpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - offset = positionsList.nextPosition(); - } - - public int getOffset() - { - return offset; - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("offset", offset) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java deleted file mode 100644 index 352c4d1bc1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class LongStreamV2Checkpoint - implements LongStreamCheckpoint -{ - private final int offset; - private final long inputStreamCheckpoint; - - public LongStreamV2Checkpoint(int offset, long inputStreamCheckpoint) - { - this.offset = offset; - this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); - } - - public LongStreamV2Checkpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - offset = positionsList.nextPosition(); - } - - public int getOffset() - { - return offset; - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("offset", offset) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java deleted file mode 100644 index 88ac0515e5..0000000000 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class RowGroupDictionaryLengthStreamCheckpoint - extends LongStreamV1Checkpoint -{ - private final int rowGroupDictionarySize; - - public RowGroupDictionaryLengthStreamCheckpoint(int rowGroupDictionarySize, int offset, long inputStreamCheckpoint) - { - super(offset, inputStreamCheckpoint); - this.rowGroupDictionarySize = rowGroupDictionarySize; - } - - public RowGroupDictionaryLengthStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - super(compressionKind, positionsList); - rowGroupDictionarySize = positionsList.nextPosition(); - } - - public int getRowGroupDictionarySize() - { - return rowGroupDictionarySize; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("rowGroupDictionarySize", rowGroupDictionarySize) - .add("offset", getOffset()) - 
.add("inputStreamCheckpoint", inputStreamCheckpointToString(getInputStreamCheckpoint())) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/StreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/StreamCheckpoint.java deleted file mode 100644 index 025c2ae5ad..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/StreamCheckpoint.java +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -public interface StreamCheckpoint -{ -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java deleted file mode 100644 index 65182d49bd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class BooleanJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private BooleanStream dataStream; - - public BooleanJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - 
generator.writeBoolean(dataStream.nextBit()); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - return String.valueOf(dataStream.nextBit()); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, BooleanStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java deleted file mode 100644 index d1008528a1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.ByteStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class ByteJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private ByteStream dataStream; - - public ByteJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is 
not present"); - - generator.writeNumber(dataStream.next()); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - return String.valueOf(dataStream.next()); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java deleted file mode 100644 index 3243ead772..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class DateJsonReader - implements JsonMapKeyReader -{ - private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1); - - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private LongStream dataStream; - - public DateJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - 
generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - long millis = dataStream.next() * MILLIS_IN_DAY; - generator.writeNumber(millis); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - long millis = dataStream.next() * MILLIS_IN_DAY; - return String.valueOf(millis); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java deleted file mode 100644 index 1adf00aeec..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed under the 
Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.DoubleStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class DoubleJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private DoubleStream dataStream; - - public DoubleJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && 
!presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - double value = dataStream.next(); - generator.writeNumber(value); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - double value = dataStream.next(); - return String.valueOf(value); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, DoubleStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java deleted file mode 100644 index 0b4f668dff..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed under the 
Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.FloatStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class FloatJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private FloatStream dataStream; - - public FloatJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) 
{ - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // write value as a double to avoid strange rounding errors - double value = dataStream.next(); - generator.writeNumber(value); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // write value as a double to avoid strange rounding errors - double value = dataStream.next(); - return String.valueOf(value); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, FloatStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java deleted file mode 100644 index 6e93f8abb2..0000000000 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import java.io.IOException; - -public interface JsonMapKeyReader - extends JsonReader -{ - String nextValueAsMapKey() - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java deleted file mode 100644 index f35cbe6d82..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -public interface JsonReader -{ - void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException; - - void openRowGroup(StreamSources dataStreamSources) - throws IOException; - - void readNextValueInto(JsonGenerator generator) - throws IOException; - - void skip(int skipSize) - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java deleted file mode 100644 index 06019757d2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.joda.time.DateTimeZone; - -public final class JsonReaders -{ - private JsonReaders() - { - } - - public static JsonMapKeyReader createJsonMapKeyReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - switch (streamDescriptor.getStreamType()) { - case BOOLEAN: - return new BooleanJsonReader(streamDescriptor); - case BYTE: - return new ByteJsonReader(streamDescriptor); - case SHORT: - case INT: - case LONG: - return new LongJsonReader(streamDescriptor); - case FLOAT: - return new FloatJsonReader(streamDescriptor); - case DOUBLE: - return new DoubleJsonReader(streamDescriptor); - case BINARY: - return new SliceJsonReader(streamDescriptor, true); - case STRING: - return new SliceJsonReader(streamDescriptor, false); - case TIMESTAMP: - return new TimestampJsonReader(streamDescriptor, hiveStorageTimeZone); - case DATE: - return new DateJsonReader(streamDescriptor); - case STRUCT: - case LIST: - case MAP: - case UNION: - case DECIMAL: - case VARCHAR: - case CHAR: - default: - throw new IllegalArgumentException("Unsupported map key type: " + streamDescriptor.getStreamType()); - } - } - - public static JsonReader createJsonReader( - StreamDescriptor streamDescriptor, - boolean checkForNulls, - DateTimeZone hiveStorageTimeZone) - { - switch (streamDescriptor.getStreamType()) { - case BOOLEAN: - return new BooleanJsonReader(streamDescriptor); - case BYTE: - return new ByteJsonReader(streamDescriptor); - case SHORT: - case INT: - case LONG: - return new LongJsonReader(streamDescriptor); - case FLOAT: - return new FloatJsonReader(streamDescriptor); - case DOUBLE: - return new DoubleJsonReader(streamDescriptor); - case BINARY: - return new SliceJsonReader(streamDescriptor, true); - case STRING: - return new SliceJsonReader(streamDescriptor, false); - case TIMESTAMP: - return new TimestampJsonReader(streamDescriptor, 
hiveStorageTimeZone); - case DATE: - return new DateJsonReader(streamDescriptor); - case STRUCT: - return new StructJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); - case LIST: - return new ListJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); - case MAP: - return new MapJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); - case UNION: - case DECIMAL: - case VARCHAR: - case CHAR: - default: - throw new IllegalArgumentException("Unsupported type: " + streamDescriptor.getStreamType()); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java deleted file mode 100644 index d6302fb8b5..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.LENGTH; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class ListJsonReader - implements JsonReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean checkForNulls; - - private final JsonReader elementReader; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private LongStream lengthStream; - - public ListJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.checkForNulls = checkForNulls; - - elementReader = createJsonReader(streamDescriptor.getNestedStreams().get(0), true, hiveStorageTimeZone); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(lengthStream != null, "Value is 
not null but length stream is not present"); - - long length = lengthStream.next(); - generator.writeStartArray(); - for (int i = 0; i < length; i++) { - elementReader.readNextValueInto(generator); - } - generator.writeEndArray(); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - long elementSkipSize = lengthStream.sum(skipSize); - elementReader.skip(Ints.checkedCast(elementSkipSize)); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - lengthStream = null; - - elementReader.openStripe(dictionaryStreamSources, encoding); - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - if (checkForNulls) { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - } - lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); - - elementReader.openRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java deleted file mode 100644 index b26fc9ab5b..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in 
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; - -public class LongDictionaryJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - @Nullable - private BooleanStream inDictionaryStream; - @Nullable - private LongStream dataStream; - - @Nonnull - private long[] dictionary = new long[0]; - - public LongDictionaryJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - 
generator.writeNull(); - return; - } - - generator.writeNumber(nextValue()); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - return String.valueOf(nextValue()); - } - - private long nextValue() - throws IOException - { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - long value = dataStream.next(); - if (inDictionaryStream == null || inDictionaryStream.nextBit()) { - value = dictionary[((int) value)]; - } - return value; - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - // skip non-null values - if (inDictionaryStream != null) { - inDictionaryStream.skip(skipSize); - } - if (skipSize > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(skipSize); - } - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - int dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); - if (dictionarySize > 0) { - if (dictionary.length < dictionarySize) { - dictionary = new long[dictionarySize]; - } - - LongStream dictionaryStream = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, LongStream.class).openStream(); - verifyFormat(dictionaryStream != null, "Dictionary is not empty but data stream is not present"); - dictionaryStream.nextLongVector(dictionarySize, dictionary); - } - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - inDictionaryStream = 
dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java deleted file mode 100644 index b6edb82db2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class LongDirectJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - @Nullable - private LongStream dataStream; - - public LongDirectJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - generator.writeNumber(dataStream.next()); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - return String.valueOf(dataStream.next()); - } - - @Override - public void skip(int skipSize) - throws 
IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - // skip non-null values - if (skipSize > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(skipSize); - } - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java deleted file mode 100644 index 4793a11280..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; - -public class LongJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - private final LongDirectJsonReader directReader; - - private final LongDictionaryJsonReader dictionaryReader; - private JsonMapKeyReader currentReader; - - public LongJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - directReader = new LongDirectJsonReader(streamDescriptor); - dictionaryReader = new LongDictionaryJsonReader(streamDescriptor); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - currentReader.readNextValueInto(generator); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - return currentReader.nextValueAsMapKey(); - } - - @Override - public void skip(int skipSize) - throws IOException - { - currentReader.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); - if (kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT) { - currentReader = directReader; - } - else if (kind == DICTIONARY || kind == DICTIONARY_V2) { - 
currentReader = dictionaryReader; - } - else { - throw new IllegalArgumentException("Unsupported encoding " + kind); - } - - currentReader.openStripe(dictionaryStreamSources, encoding); - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - currentReader.openRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java deleted file mode 100644 index 5b6b73b055..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonMapKeyReader; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.LENGTH; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class MapJsonReader - implements JsonReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean checkForNulls; - - private final JsonMapKeyReader keyReader; - private final JsonReader valueReader; - - @Nullable - private BooleanStream presentStream; - @Nullable - private LongStream lengthStream; - - public MapJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.checkForNulls = checkForNulls; - - keyReader = createJsonMapKeyReader(streamDescriptor.getNestedStreams().get(0), hiveStorageTimeZone); - valueReader = createJsonReader(streamDescriptor.getNestedStreams().get(1), true, hiveStorageTimeZone); - } - - 
@Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - long length = lengthStream.next(); - generator.writeStartObject(); - for (int i = 0; i < length; i++) { - String name = keyReader.nextValueAsMapKey(); - if (name == null) { - valueReader.skip(1); - } - else { - generator.writeFieldName(name); - valueReader.readNextValueInto(generator); - } - } - generator.writeEndObject(); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - // skip non-null values - long elementSkipSize = lengthStream.sum(skipSize); - keyReader.skip(Ints.checkedCast(elementSkipSize)); - valueReader.skip(Ints.checkedCast(elementSkipSize)); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - lengthStream = null; - - keyReader.openStripe(dictionaryStreamSources, encoding); - valueReader.openStripe(dictionaryStreamSources, encoding); - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - if (checkForNulls) { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - } - lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); - - keyReader.openRowGroup(dataStreamSources); - valueReader.openRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java deleted file mode 100644 index bf7cb6fc13..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.google.common.io.BaseEncoding; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.*; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static java.nio.charset.StandardCharsets.UTF_8; - -public class SliceDictionaryJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean writeBinary; - - @Nonnull - private DictionaryEntry[] dictionary = new DictionaryEntry[0]; - - @Nonnull - private int[] dictionaryLength = new int[0]; - - @Nonnull - private DictionaryEntry[] rowGroupDictionary = new DictionaryEntry[0]; - - @Nonnull - private int[] rowGroupDictionaryLength = new int[0]; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private BooleanStream inDictionaryStream; - - @Nullable - private LongStream dataStream; - - public SliceDictionaryJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.writeBinary = writeBinary; - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - DictionaryEntry value = getNextValue(); - - byte[] data = value.getData(); - int offset = 
value.getOffset(); - int length = value.length(); - if (writeBinary) { - generator.writeBinary(data, offset, length); - } - else { - generator.writeUTF8String(data, offset, length); - } - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - DictionaryEntry value = getNextValue(); - - byte[] data = value.getData(); - int offset = value.getOffset(); - int length = value.length(); - if (writeBinary) { - return BaseEncoding.base64().encode(data, offset, length); - } - else { - return new String(data, offset, length, UTF_8); - } - } - - private DictionaryEntry getNextValue() - throws IOException - { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - int dictionaryIndex = Ints.checkedCast(dataStream.next()); - - DictionaryEntry value; - if (inDictionaryStream == null || inDictionaryStream.nextBit()) { - value = dictionary[dictionaryIndex]; - } - else { - value = rowGroupDictionary[dictionaryIndex]; - } - return value; - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null length - if (inDictionaryStream != null) { - inDictionaryStream.skip(skipSize); - } - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - int dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); - if (dictionarySize > 0) { - // resize the dictionary array if necessary - if (dictionary.length < dictionarySize) { - dictionary = new DictionaryEntry[dictionarySize]; - dictionaryLength = new int[dictionarySize]; - } - - LongStream lengthStream = 
dictionaryStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); - verifyFormat(lengthStream != null, "Dictionary is not empty but length stream is not present"); - lengthStream.nextIntVector(dictionarySize, dictionaryLength); - - ByteArrayStream dictionaryDataStream = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, ByteArrayStream.class).openStream(); - readDictionary(dictionaryDataStream, dictionarySize, dictionaryLength, dictionary); - } - - presentStream = null; - dataStream = null; - inDictionaryStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - RowGroupDictionaryLengthStream lengthStream = dataStreamSources.getStreamSource( - streamDescriptor, - ROW_GROUP_DICTIONARY_LENGTH, - RowGroupDictionaryLengthStream.class).openStream(); - - if (lengthStream == null) { - inDictionaryStream = null; - } - else { - inDictionaryStream = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class).openStream(); - - int dictionaryEntryCount = lengthStream.getEntryCount(); - - // resize the dictionary array if necessary - if (rowGroupDictionary.length < dictionaryEntryCount) { - rowGroupDictionary = new DictionaryEntry[dictionaryEntryCount]; - rowGroupDictionaryLength = new int[dictionaryEntryCount]; - } - - // read the lengths - lengthStream.nextIntVector(dictionaryEntryCount, rowGroupDictionaryLength); - - ByteArrayStream dictionaryDataStream = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY, ByteArrayStream.class).openStream(); - readDictionary(dictionaryDataStream, dictionaryEntryCount, rowGroupDictionaryLength, rowGroupDictionary); - } - - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - } - - private static void 
readDictionary(ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, DictionaryEntry[] dictionary) - throws IOException - { - // sum lengths - int totalLength = 0; - for (int i = 0; i < dictionarySize; i++) { - totalLength += dictionaryLength[i]; - } - - // read dictionary data - byte[] dictionaryData = new byte[0]; - if (totalLength > 0) { - verifyFormat(dictionaryDataStream != null, "Dictionary length is not zero but dictionary data stream is not present"); - dictionaryData = dictionaryDataStream.next(totalLength); - } - - // build dictionary slices - int offset = 0; - for (int i = 0; i < dictionarySize; i++) { - int length = dictionaryLength[i]; - dictionary[i] = new DictionaryEntry(dictionaryData, offset, length); - offset += length; - } - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } - - private static class DictionaryEntry - { - private final byte[] dictionary; - private final int offset; - private final int length; - - public DictionaryEntry(byte[] dictionary, int offset, int length) - { - this.dictionary = dictionary; - this.offset = offset; - this.length = length; - } - - public int length() - { - return length; - } - - public byte[] getData() - { - return dictionary; - } - - public int getOffset() - { - return offset; - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java deleted file mode 100644 index 6f6630c59e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.google.common.io.BaseEncoding; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.ByteArrayStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static java.nio.charset.StandardCharsets.UTF_8; - -public class SliceDirectJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean writeBinary; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private LongStream lengthStream; - - @Nullable - private ByteArrayStream dataStream; - - @Nonnull - private byte[] data = new byte[1024]; - - public SliceDirectJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) - { - 
this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.writeBinary = writeBinary; - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - int length = bufferNextValue(); - - if (writeBinary) { - generator.writeBinary(data, 0, length); - } - else { - generator.writeUTF8String(data, 0, length); - } - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - int length = bufferNextValue(); - - if (writeBinary) { - return BaseEncoding.base64().encode(data, 0, length); - } - else { - return new String(data, 0, length, UTF_8); - } - } - - private int bufferNextValue() - throws IOException - { - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - int length = Ints.checkedCast(lengthStream.next()); - if (data.length < length) { - data = new byte[length]; - } - - if (length > 0) { - verifyFormat(dataStream != null, "Length is not zero but data stream is not present"); - dataStream.next(length, data); - } - return length; - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - // skip non-null length - long dataSkipSize = lengthStream.sum(skipSize); - - if (dataSkipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Length is not zero but data stream is not present"); - - // skip data bytes - dataStream.skip(Ints.checkedCast(dataSkipSize)); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - lengthStream 
= null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteArrayStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java deleted file mode 100644 index 68892ca244..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; - -public class SliceJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - private final SliceDirectJsonReader directReader; - private final SliceDictionaryJsonReader dictionaryReader; - private JsonMapKeyReader currentReader; - - public SliceJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - directReader = new SliceDirectJsonReader(streamDescriptor, writeBinary); - dictionaryReader = new SliceDictionaryJsonReader(streamDescriptor, writeBinary); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - currentReader.readNextValueInto(generator); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - return currentReader.nextValueAsMapKey(); - } - - @Override - public void skip(int skipSize) - throws IOException - { - currentReader.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, - List encoding) - throws IOException - { - ColumnEncodingKind columnEncodingKind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); - if (columnEncodingKind == DIRECT || columnEncodingKind == DIRECT_V2 || columnEncodingKind == 
ColumnEncodingKind.DWRF_DIRECT) { - currentReader = directReader; - } - else if (columnEncodingKind == DICTIONARY || columnEncodingKind == DICTIONARY_V2) { - currentReader = dictionaryReader; - } - else { - throw new IllegalArgumentException("Unsupported encoding " + columnEncodingKind); - } - - currentReader.openStripe(dictionaryStreamSources, encoding); - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - currentReader.openRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java deleted file mode 100644 index 600b7b778d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class StructJsonReader - implements JsonReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean checkForNulls; - private final JsonReader[] structFields; - - @Nullable - private BooleanStream presentStream; - - public StructJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.checkForNulls = checkForNulls; - - List nestedStreams = streamDescriptor.getNestedStreams(); - this.structFields = new JsonReader[nestedStreams.size()]; - for (int i = 0; i < nestedStreams.size(); i++) { - StreamDescriptor nestedStream = nestedStreams.get(i); - this.structFields[i] = createJsonReader(nestedStream, true, hiveStorageTimeZone); - } - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - generator.writeStartArray(); - for (JsonReader structField : structFields) { - structField.readNextValueInto(generator); - } - generator.writeEndArray(); - } - - @Override 
- public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - // skip non-null values - for (JsonReader structField : structFields) { - structField.skip(skipSize); - } - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - - for (JsonReader structField : structFields) { - structField.openStripe(dictionaryStreamSources, encoding); - } - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - if (checkForNulls) { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - } - - for (JsonReader structField : structFields) { - structField.openRowGroup(dataStreamSources); - } - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java deleted file mode 100644 index bfebf78658..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.reader.TimestampStreamReader.decodeTimestamp; - -public class TimestampJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - private final long baseTimestampInSeconds; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private LongStream secondsStream; - - @Nullable - private LongStream nanosStream; - - public TimestampJsonReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / 1000; - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - 
verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - long timestamp = decodeTimestamp(secondsStream.next(), nanosStream.next(), baseTimestampInSeconds); - generator.writeNumber(timestamp); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - long timestamp = decodeTimestamp(secondsStream.next(), nanosStream.next(), baseTimestampInSeconds); - return String.valueOf(timestamp); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - // skip non-null values - secondsStream.skip(skipSize); - nanosStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - secondsStream = null; - nanosStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - secondsStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - nanosStream = dataStreamSources.getStreamSource(streamDescriptor, SECONDARY, LongStream.class).openStream(); - } - - @Override - public String toString() - { - return 
toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/BooleanStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/BooleanStatistics.java deleted file mode 100644 index 971cfe3779..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/BooleanStatistics.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public class BooleanStatistics -{ - private final long trueValueCount; - - public BooleanStatistics(long trueValueCount) - { - this.trueValueCount = trueValueCount; - } - - public long getTrueValueCount() - { - return trueValueCount; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnEncoding.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnEncoding.java deleted file mode 100644 index 5713a28ec4..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnEncoding.java +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public class ColumnEncoding -{ - public enum ColumnEncodingKind - { - DIRECT, - DICTIONARY, - DIRECT_V2, - DICTIONARY_V2, - DWRF_DIRECT, - } - - private final ColumnEncodingKind columnEncodingKind; - private final int dictionarySize; - - public ColumnEncoding(ColumnEncodingKind columnEncodingKind, int dictionarySize) - { - this.columnEncodingKind = checkNotNull(columnEncodingKind, "columnEncodingKind is null"); - this.dictionarySize = dictionarySize; - } - - public ColumnEncodingKind getColumnEncodingKind() - { - return columnEncodingKind; - } - - public int getDictionarySize() - { - return dictionarySize; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("columnEncodingKind", columnEncodingKind) - .add("dictionarySize", dictionarySize) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnStatistics.java deleted file mode 100644 index 7f1e0d988a..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/ColumnStatistics.java +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in 
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public class ColumnStatistics -{ - private final Long numberOfValues; - private final BooleanStatistics booleanStatistics; - private final IntegerStatistics integerStatistics; - private final DoubleStatistics doubleStatistics; - private final StringStatistics stringStatistics; - private final DateStatistics dateStatistics; - - public ColumnStatistics(Long numberOfValues, - BooleanStatistics booleanStatistics, - IntegerStatistics integerStatistics, - DoubleStatistics doubleStatistics, - StringStatistics stringStatistics, - DateStatistics dateStatistics) - { - this.numberOfValues = numberOfValues; - this.booleanStatistics = booleanStatistics; - this.integerStatistics = integerStatistics; - this.doubleStatistics = doubleStatistics; - this.stringStatistics = stringStatistics; - this.dateStatistics = dateStatistics; - } - - public boolean hasNumberOfValues() - { - return numberOfValues != null; - } - - public long getNumberOfValues() - { - return numberOfValues == null ? 
0 : numberOfValues; - } - - public BooleanStatistics getBooleanStatistics() - { - return booleanStatistics; - } - - public DateStatistics getDateStatistics() - { - return dateStatistics; - } - - public DoubleStatistics getDoubleStatistics() - { - return doubleStatistics; - } - - public IntegerStatistics getIntegerStatistics() - { - return integerStatistics; - } - - public StringStatistics getStringStatistics() - { - return stringStatistics; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/CompressionKind.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/CompressionKind.java deleted file mode 100644 index 1b34f17e9e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/CompressionKind.java +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public enum CompressionKind -{ - UNCOMPRESSED, ZLIB, SNAPPY -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DateStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DateStatistics.java deleted file mode 100644 index 5a1b409506..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DateStatistics.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public class DateStatistics - implements RangeStatistics -{ - private final Integer minimum; - private final Integer maximum; - - public DateStatistics(Integer minimum, Integer maximum) - { - this.minimum = minimum; - this.maximum = maximum; - } - - @Override - public Integer getMin() - { - return minimum; - } - - @Override - public Integer getMax() - { - return maximum; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DoubleStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DoubleStatistics.java deleted file mode 100644 index 5bb13e7307..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DoubleStatistics.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public class DoubleStatistics - implements RangeStatistics -{ - private final Double minimum; - private final Double maximum; - - public DoubleStatistics(Double minimum, Double maximum) - { - this.minimum = minimum; - this.maximum = maximum; - } - - @Override - public Double getMin() - { - return minimum; - } - - @Override - public Double getMax() - { - return maximum; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java deleted file mode 100644 index 20ae97058e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.facebook.hive.orc.OrcProto; -import com.facebook.hive.orc.OrcProto.ColumnEncoding.Kind; -import com.google.common.base.Function; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterables; -import com.google.common.primitives.Ints; -import com.google.protobuf.CodedInputStream; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkState; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; - -public class DwrfMetadataReader - implements MetadataReader -{ - @Override - public PostScript readPostScript(byte[] data, int offset, int length) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(data, offset, length); - OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(input); - - return new PostScript( - ImmutableList.of(), - postScript.getFooterLength(), - 0, - toCompression(postScript.getCompression()), - postScript.getCompressionBlockSize()); - } - - @Override - public Metadata readMetadata(InputStream inputStream) - throws IOException - { - return new Metadata(ImmutableList.of()); - } - - @Override - public Footer readFooter(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.Footer footer = OrcProto.Footer.parseFrom(input); - return new Footer( - footer.getNumberOfRows(), - footer.getRowIndexStride(), - toStripeInformation(footer.getStripesList()), - toType(footer.getTypesList()), - 
toColumnStatistics(footer.getStatisticsList(), false)); - } - - private static List toStripeInformation(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public StripeInformation apply(OrcProto.StripeInformation type) - { - return toStripeInformation(type); - } - })); - } - - private static StripeInformation toStripeInformation(OrcProto.StripeInformation stripeInformation) - { - return new StripeInformation( - Ints.checkedCast(stripeInformation.getNumberOfRows()), - stripeInformation.getOffset(), - stripeInformation.getIndexLength(), - stripeInformation.getDataLength(), - stripeInformation.getFooterLength()); - } - - @Override - public StripeFooter readStripeFooter(List types, InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.StripeFooter stripeFooter = OrcProto.StripeFooter.parseFrom(input); - return new StripeFooter(toStream(stripeFooter.getStreamsList()), toColumnEncoding(types, stripeFooter.getColumnsList())); - } - - private static Stream toStream(OrcProto.Stream stream) - { - return new Stream(stream.getColumn(), toStreamKind(stream.getKind()), Ints.checkedCast(stream.getLength()), stream.getUseVInts()); - } - - private static List toStream(List streams) - { - return ImmutableList.copyOf(Iterables.transform(streams, new Function() - { - @Override - public Stream apply(OrcProto.Stream stream) - { - return toStream(stream); - } - })); - } - - private static ColumnEncoding toColumnEncoding(OrcTypeKind type, OrcProto.ColumnEncoding columnEncoding) - { - return new ColumnEncoding(toColumnEncodingKind(type, columnEncoding.getKind()), columnEncoding.getDictionarySize()); - } - - private static List toColumnEncoding(List types, List columnEncodings) - { - checkArgument(types.size() == columnEncodings.size()); - - ImmutableList.Builder encodings = ImmutableList.builder(); - for (int i = 0; i < types.size(); i++) { - OrcType type = 
types.get(i); - encodings.add(toColumnEncoding(type.getOrcTypeKind(), columnEncodings.get(i))); - } - return encodings.build(); - } - - @Override - public List readRowIndexes(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); - return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() - { - @Override - public RowGroupIndex apply(OrcProto.RowIndexEntry rowIndexEntry) - { - return toRowGroupIndex(rowIndexEntry); - } - })); - } - - private static RowGroupIndex toRowGroupIndex(OrcProto.RowIndexEntry rowIndexEntry) - { - List positionsList = rowIndexEntry.getPositionsList(); - ImmutableList.Builder positions = ImmutableList.builder(); - for (int index = 0; index < positionsList.size(); index++) { - long longPosition = positionsList.get(index); - int intPosition = (int) longPosition; - - checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index); - - positions.add(intPosition); - } - return new RowGroupIndex(positions.build(), toColumnStatistics(rowIndexEntry.getStatistics(), true)); - } - - private static List toColumnStatistics(List columnStatistics, final boolean isRowGroup) - { - if (columnStatistics == null) { - return ImmutableList.of(); - } - return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() - { - @Override - public ColumnStatistics apply(OrcProto.ColumnStatistics columnStatistics) - { - return toColumnStatistics(columnStatistics, isRowGroup); - } - })); - } - - private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) - { - return new ColumnStatistics( - statistics.getNumberOfValues(), - toBooleanStatistics(statistics.getBucketStatistics()), - toIntegerStatistics(statistics.getIntStatistics()), - toDoubleStatistics(statistics.getDoubleStatistics()), - 
toStringStatistics(statistics.getStringStatistics(), isRowGroup), - null); - } - - private static BooleanStatistics toBooleanStatistics(OrcProto.BucketStatistics bucketStatistics) - { - if (bucketStatistics.getCountCount() == 0) { - return null; - } - - return new BooleanStatistics(bucketStatistics.getCount(0)); - } - - private static IntegerStatistics toIntegerStatistics(OrcProto.IntegerStatistics integerStatistics) - { - if (!integerStatistics.hasMinimum() && !integerStatistics.hasMaximum()) { - return null; - } - - return new IntegerStatistics( - integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null, - integerStatistics.hasMaximum() ? integerStatistics.getMaximum() : null); - } - - private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics doubleStatistics) - { - if (!doubleStatistics.hasMinimum() && !doubleStatistics.hasMaximum()) { - return null; - } - - // TODO remove this when double statistics are changed to correctly deal with NaNs - // if either min or max is NaN, ignore the stat - if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || - (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum()))) { - return null; - } - - return new DoubleStatistics( - doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null, - doubleStatistics.hasMaximum() ? 
doubleStatistics.getMaximum() : null); - } - - private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) - { - // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 - if (!isRowGroup) { - return null; - } - - if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { - return null; - } - - // temporarily disable string statistics until we figure out the implications of how UTF-16 - // strings are compared when they contain surrogate pairs and replacement characters - if (true) { - return null; - } - - return new StringStatistics( - stringStatistics.hasMinimum() ? stringStatistics.getMinimum() : null, - stringStatistics.hasMaximum() ? stringStatistics.getMaximum() : null); - } - - private static OrcType toType(OrcProto.Type type) - { - return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList()); - } - - private static List toType(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public OrcType apply(OrcProto.Type type) - { - return toType(type); - } - })); - } - - private static OrcTypeKind toTypeKind(OrcProto.Type.Kind kind) - { - switch (kind) { - case BOOLEAN: - return OrcTypeKind.BOOLEAN; - case BYTE: - return OrcTypeKind.BYTE; - case SHORT: - return OrcTypeKind.SHORT; - case INT: - return OrcTypeKind.INT; - case LONG: - return OrcTypeKind.LONG; - case FLOAT: - return OrcTypeKind.FLOAT; - case DOUBLE: - return OrcTypeKind.DOUBLE; - case STRING: - return OrcTypeKind.STRING; - case BINARY: - return OrcTypeKind.BINARY; - case TIMESTAMP: - return OrcTypeKind.TIMESTAMP; - case LIST: - return OrcTypeKind.LIST; - case MAP: - return OrcTypeKind.MAP; - case STRUCT: - return OrcTypeKind.STRUCT; - case UNION: - return OrcTypeKind.UNION; - default: - throw new IllegalArgumentException(kind + " data type not implemented yet"); - } - } - - private static 
StreamKind toStreamKind(OrcProto.Stream.Kind kind) - { - switch (kind) { - case PRESENT: - return StreamKind.PRESENT; - case DATA: - return StreamKind.DATA; - case LENGTH: - return StreamKind.LENGTH; - case DICTIONARY_DATA: - return StreamKind.DICTIONARY_DATA; - case DICTIONARY_COUNT: - return StreamKind.DICTIONARY_COUNT; - case NANO_DATA: - return StreamKind.SECONDARY; - case ROW_INDEX: - return StreamKind.ROW_INDEX; - case IN_DICTIONARY: - return StreamKind.IN_DICTIONARY; - case STRIDE_DICTIONARY: - return StreamKind.ROW_GROUP_DICTIONARY; - case STRIDE_DICTIONARY_LENGTH: - return StreamKind.ROW_GROUP_DICTIONARY_LENGTH; - default: - throw new IllegalArgumentException(kind + " stream type not implemented yet"); - } - } - - private static ColumnEncodingKind toColumnEncodingKind(OrcTypeKind type, Kind kind) - { - switch (kind) { - case DIRECT: - if (type == OrcTypeKind.SHORT || type == OrcTypeKind.INT || type == OrcTypeKind.LONG) { - return ColumnEncodingKind.DWRF_DIRECT; - } - else { - return ColumnEncodingKind.DIRECT; - } - case DICTIONARY: - return ColumnEncodingKind.DICTIONARY; - default: - throw new IllegalArgumentException(kind + " stream encoding not implemented yet"); - } - } - - private static CompressionKind toCompression(OrcProto.CompressionKind compression) - { - switch (compression) { - case NONE: - return UNCOMPRESSED; - case ZLIB: - return ZLIB; - case SNAPPY: - return SNAPPY; - default: - throw new IllegalArgumentException(compression + " compression not implemented yet"); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Footer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Footer.java deleted file mode 100644 index 54bc53fa54..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Footer.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 
(the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.google.common.collect.ImmutableList; - -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public class Footer -{ - private final long numberOfRows; - private final int rowsInRowGroup; - private final List stripes; - private final List types; - private final List fileStats; - - public Footer(long numberOfRows, int rowsInRowGroup, List stripes, List types, List fileStats) - { - this.numberOfRows = numberOfRows; - this.rowsInRowGroup = rowsInRowGroup; - this.stripes = ImmutableList.copyOf(checkNotNull(stripes, "stripes is null")); - this.types = ImmutableList.copyOf(checkNotNull(types, "types is null")); - this.fileStats = ImmutableList.copyOf(checkNotNull(fileStats, "columnStatistics is null")); - } - - public long getNumberOfRows() - { - return numberOfRows; - } - - public int getRowsInRowGroup() - { - return rowsInRowGroup; - } - - public List getStripes() - { - return stripes; - } - - public List getTypes() - { - return types; - } - - public List getFileStats() - { - return fileStats; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("numberOfRows", numberOfRows) - .add("rowsInRowGroup", rowsInRowGroup) - .add("stripes", stripes) - .add("types", types) - .add("columnStatistics", fileStats) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/IntegerStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/IntegerStatistics.java deleted file mode 100644 index 6ab7b1252c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/IntegerStatistics.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public class IntegerStatistics - implements RangeStatistics -{ - private final Long minimum; - private final Long maximum; - - public IntegerStatistics(Long minimum, Long maximum) - { - this.minimum = minimum; - this.maximum = maximum; - } - - public Long getMin() - { - return minimum; - } - - public Long getMax() - { - return maximum; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Metadata.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Metadata.java deleted file mode 100644 index 29d8068332..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Metadata.java +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import java.util.List; - -public class Metadata -{ - private final List stripeStatistics; - - public Metadata(List stripeStatistics) - { - this.stripeStatistics = stripeStatistics; - } - - public List getStripeStatsList() - { - return stripeStatistics; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/MetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/MetadataReader.java deleted file mode 100644 index a6805dc94f..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/MetadataReader.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; - -public interface MetadataReader -{ - PostScript readPostScript(byte[] data, int offset, int length) - throws IOException; - - Metadata readMetadata(InputStream inputStream) - throws IOException; - - Footer readFooter(InputStream inputStream) - throws IOException; - - StripeFooter readStripeFooter(List types, InputStream inputStream) - throws IOException; - - List readRowIndexes(InputStream inputStream) - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java deleted file mode 100644 index 38bae8b8f2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.facebook.presto.hive.shaded.com.google.protobuf.CodedInputStream; -import com.google.common.base.Function; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterables; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; -import org.apache.hadoop.hive.ql.io.orc.OrcProto; -import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; - -import static com.google.common.base.Preconditions.checkState; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; - -public class OrcMetadataReader - implements MetadataReader -{ - @Override - public PostScript readPostScript(byte[] data, int offset, int length) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(data, offset, length); - OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(input); - - return new PostScript( - postScript.getVersionList(), - postScript.getFooterLength(), - postScript.getMetadataLength(), - toCompression(postScript.getCompression()), - postScript.getCompressionBlockSize()); - } - - @Override - public Metadata readMetadata(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.Metadata metadata = OrcProto.Metadata.parseFrom(input); - return new Metadata(toStripeStatistics(metadata.getStripeStatsList())); - } - - private static List toStripeStatistics(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public StripeStatistics apply(OrcProto.StripeStatistics type) - { - return 
toStripeStatistics(type); - } - })); - } - - private static StripeStatistics toStripeStatistics(OrcProto.StripeStatistics stripeStatistics) - { - return new StripeStatistics(toColumnStatistics(stripeStatistics.getColStatsList(), false)); - } - - @Override - public Footer readFooter(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.Footer footer = OrcProto.Footer.parseFrom(input); - return new Footer( - footer.getNumberOfRows(), - footer.getRowIndexStride(), - toStripeInformation(footer.getStripesList()), - toType(footer.getTypesList()), - toColumnStatistics(footer.getStatisticsList(), false)); - } - - private static List toStripeInformation(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public StripeInformation apply(OrcProto.StripeInformation type) - { - return toStripeInformation(type); - } - })); - } - - private static StripeInformation toStripeInformation(OrcProto.StripeInformation stripeInformation) - { - return new StripeInformation( - Ints.checkedCast(stripeInformation.getNumberOfRows()), - stripeInformation.getOffset(), - stripeInformation.getIndexLength(), - stripeInformation.getDataLength(), - stripeInformation.getFooterLength()); - } - - @Override - public StripeFooter readStripeFooter(List types, InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.StripeFooter stripeFooter = OrcProto.StripeFooter.parseFrom(input); - return new StripeFooter(toStream(stripeFooter.getStreamsList()), toColumnEncoding(stripeFooter.getColumnsList())); - } - - private static Stream toStream(OrcProto.Stream stream) - { - return new Stream(stream.getColumn(), toStreamKind(stream.getKind()), Ints.checkedCast(stream.getLength()), true); - } - - private static List toStream(List streams) - { - return ImmutableList.copyOf(Iterables.transform(streams, new Function() - 
{ - @Override - public Stream apply(OrcProto.Stream stream) - { - return toStream(stream); - } - })); - } - - private static ColumnEncoding toColumnEncoding(OrcProto.ColumnEncoding columnEncoding) - { - return new ColumnEncoding(toColumnEncodingKind(columnEncoding.getKind()), columnEncoding.getDictionarySize()); - } - - private static List toColumnEncoding(List columnEncodings) - { - return ImmutableList.copyOf(Iterables.transform(columnEncodings, new Function() - { - @Override - public ColumnEncoding apply(OrcProto.ColumnEncoding columnEncoding) - { - return toColumnEncoding(columnEncoding); - } - })); - } - - @Override - public List readRowIndexes(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); - return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() - { - @Override - public RowGroupIndex apply(RowIndexEntry rowIndexEntry) - { - return toRowGroupIndex(rowIndexEntry); - } - })); - } - - private static RowGroupIndex toRowGroupIndex(RowIndexEntry rowIndexEntry) - { - List positionsList = rowIndexEntry.getPositionsList(); - ImmutableList.Builder positions = ImmutableList.builder(); - for (int index = 0; index < positionsList.size(); index++) { - long longPosition = positionsList.get(index); - int intPosition = (int) longPosition; - - checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index); - - positions.add(intPosition); - } - return new RowGroupIndex(positions.build(), toColumnStatistics(rowIndexEntry.getStatistics(), true)); - } - - private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) - { - return new ColumnStatistics( - statistics.getNumberOfValues(), - toBooleanStatistics(statistics.getBucketStatistics()), - toIntegerStatistics(statistics.getIntStatistics()), - 
toDoubleStatistics(statistics.getDoubleStatistics()), - toStringStatistics(statistics.getStringStatistics(), isRowGroup), - toDateStatistics(statistics.getDateStatistics(), isRowGroup)); - } - - private static List toColumnStatistics(List columnStatistics, final boolean isRowGroup) - { - if (columnStatistics == null) { - return ImmutableList.of(); - } - return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() - { - @Override - public ColumnStatistics apply(OrcProto.ColumnStatistics columnStatistics) - { - return toColumnStatistics(columnStatistics, isRowGroup); - } - })); - } - - private static BooleanStatistics toBooleanStatistics(OrcProto.BucketStatistics bucketStatistics) - { - if (bucketStatistics.getCountCount() == 0) { - return null; - } - - return new BooleanStatistics(bucketStatistics.getCount(0)); - } - - private static IntegerStatistics toIntegerStatistics(OrcProto.IntegerStatistics integerStatistics) - { - if (!integerStatistics.hasMinimum() && !integerStatistics.hasMaximum()) { - return null; - } - - return new IntegerStatistics( - integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null, - integerStatistics.hasMaximum() ? integerStatistics.getMaximum() : null); - } - - private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics doubleStatistics) - { - if (!doubleStatistics.hasMinimum() && !doubleStatistics.hasMaximum()) { - return null; - } - - // TODO remove this when double statistics are changed to correctly deal with NaNs - // if either min or max is NaN, ignore the stat - if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || - (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum()))) { - return null; - } - - return new DoubleStatistics( - doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null, - doubleStatistics.hasMaximum() ? 
doubleStatistics.getMaximum() : null); - } - - private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) - { - // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 - if (!isRowGroup) { - return null; - } - - if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { - return null; - } - - return new StringStatistics( - stringStatistics.hasMinimum() ? stringStatistics.getMinimum() : null, - stringStatistics.hasMaximum() ? stringStatistics.getMaximum() : null); - } - - private static DateStatistics toDateStatistics(OrcProto.DateStatistics dateStatistics, boolean isRowGroup) - { - // TODO remove this when date statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 - if (!isRowGroup) { - return null; - } - - if (!dateStatistics.hasMinimum() && !dateStatistics.hasMaximum()) { - return null; - } - - // temporarily disable string statistics until we figure out the implications of how UTF-16 - // strings are compared when they contain surrogate pairs and replacement characters - if (true) { - return null; - } - - return new DateStatistics( - dateStatistics.hasMinimum() ? dateStatistics.getMinimum() : null, - dateStatistics.hasMaximum() ? 
dateStatistics.getMaximum() : null); - } - - private static OrcType toType(OrcProto.Type type) - { - return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList()); - } - - private static List toType(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public OrcType apply(OrcProto.Type type) - { - return toType(type); - } - })); - } - - private static OrcTypeKind toTypeKind(OrcProto.Type.Kind typeKind) - { - switch (typeKind) { - case BOOLEAN: - return OrcTypeKind.BOOLEAN; - case BYTE: - return OrcTypeKind.BYTE; - case SHORT: - return OrcTypeKind.SHORT; - case INT: - return OrcTypeKind.INT; - case LONG: - return OrcTypeKind.LONG; - case FLOAT: - return OrcTypeKind.FLOAT; - case DOUBLE: - return OrcTypeKind.DOUBLE; - case STRING: - return OrcTypeKind.STRING; - case BINARY: - return OrcTypeKind.BINARY; - case TIMESTAMP: - return OrcTypeKind.TIMESTAMP; - case LIST: - return OrcTypeKind.LIST; - case MAP: - return OrcTypeKind.MAP; - case STRUCT: - return OrcTypeKind.STRUCT; - case UNION: - return OrcTypeKind.UNION; - case DECIMAL: - return OrcTypeKind.DECIMAL; - case DATE: - return OrcTypeKind.DATE; - case VARCHAR: - return OrcTypeKind.VARCHAR; - case CHAR: - return OrcTypeKind.CHAR; - default: - throw new IllegalStateException(typeKind + " stream type not implemented yet"); - } - } - - private static StreamKind toStreamKind(OrcProto.Stream.Kind streamKind) - { - switch (streamKind) { - case PRESENT: - return StreamKind.PRESENT; - case DATA: - return StreamKind.DATA; - case LENGTH: - return StreamKind.LENGTH; - case DICTIONARY_DATA: - return StreamKind.DICTIONARY_DATA; - case DICTIONARY_COUNT: - return StreamKind.DICTIONARY_COUNT; - case SECONDARY: - return StreamKind.SECONDARY; - case ROW_INDEX: - return StreamKind.ROW_INDEX; - default: - throw new IllegalStateException(streamKind + " stream type not implemented yet"); - } - } - - private static ColumnEncodingKind 
toColumnEncodingKind(OrcProto.ColumnEncoding.Kind columnEncodingKind) - { - switch (columnEncodingKind) { - case DIRECT: - return ColumnEncodingKind.DIRECT; - case DIRECT_V2: - return ColumnEncodingKind.DIRECT_V2; - case DICTIONARY: - return ColumnEncodingKind.DICTIONARY; - case DICTIONARY_V2: - return ColumnEncodingKind.DICTIONARY_V2; - default: - throw new IllegalStateException(columnEncodingKind + " stream encoding not implemented yet"); - } - } - - private static CompressionKind toCompression(OrcProto.CompressionKind compression) - { - switch (compression) { - case NONE: - return UNCOMPRESSED; - case ZLIB: - return ZLIB; - case SNAPPY: - return SNAPPY; - default: - throw new IllegalStateException(compression + " compression not implemented yet"); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcType.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcType.java deleted file mode 100644 index 2690d626a6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcType.java +++ /dev/null @@ -1,105 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.google.common.collect.ImmutableList; - -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -public class OrcType -{ - public enum OrcTypeKind - { - BOOLEAN, - - BYTE, - SHORT, - INT, - LONG, - DECIMAL, - - FLOAT, - DOUBLE, - - STRING, - VARCHAR, - CHAR, - - BINARY, - - DATE, - TIMESTAMP, - - LIST, - MAP, - STRUCT, - UNION, - } - - private final OrcTypeKind orcTypeKind; - private final List fieldTypeIndexes; - private final List fieldNames; - - public OrcType(OrcTypeKind orcTypeKind, List fieldTypeIndexes, List fieldNames) - { - this.orcTypeKind = checkNotNull(orcTypeKind, "typeKind is null"); - this.fieldTypeIndexes = ImmutableList.copyOf(checkNotNull(fieldTypeIndexes, "fieldTypeIndexes is null")); - if (fieldNames == null || (fieldNames.isEmpty() && !fieldTypeIndexes.isEmpty())) { - this.fieldNames = null; - } - else { - this.fieldNames = ImmutableList.copyOf(checkNotNull(fieldNames, "fieldNames is null")); - checkArgument(fieldNames.size() == fieldTypeIndexes.size(), "fieldNames and fieldTypeIndexes have different sizes"); - } - } - - public OrcTypeKind getOrcTypeKind() - { - return orcTypeKind; - } - - public int getFieldCount() - { - return fieldTypeIndexes.size(); - } - - public int getFieldTypeIndex(int field) - { - return fieldTypeIndexes.get(field); - } - - public String getFieldName(int field) - { - return fieldNames.get(field); - } - - public List getFieldNames() - { - return fieldNames; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("orcTypeKind", orcTypeKind) - .add("fieldTypeIndexes", fieldTypeIndexes) - .add("fieldNames", fieldNames) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/PostScript.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/PostScript.java deleted file mode 100644 index 89ea78a632..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/PostScript.java +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.google.common.collect.ImmutableList; - -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public class PostScript -{ - private final List version; - private final long footerLength; - private final long metadataLength; - private final CompressionKind compression; - private final long compressionBlockSize; - - public PostScript(List version, long footerLength, long metadataLength, CompressionKind compression, long compressionBlockSize) - { - this.version = ImmutableList.copyOf(checkNotNull(version, "version is null")); - this.footerLength = footerLength; - this.metadataLength = metadataLength; - this.compression = checkNotNull(compression, "compressionKind is null"); - this.compressionBlockSize = compressionBlockSize; - } - - public List getVersion() - { - return version; - } - - public long getFooterLength() - { 
- return footerLength; - } - - public long getMetadataLength() - { - return metadataLength; - } - - public CompressionKind getCompression() - { - return compression; - } - - public long getCompressionBlockSize() - { - return compressionBlockSize; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("version", version) - .add("footerLength", footerLength) - .add("metadataLength", metadataLength) - .add("compressionKind", compression) - .add("compressionBlockSize", compressionBlockSize) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RangeStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RangeStatistics.java deleted file mode 100644 index 325003fb19..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RangeStatistics.java +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public interface RangeStatistics -{ - T getMin(); - T getMax(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RowGroupIndex.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RowGroupIndex.java deleted file mode 100644 index 5691882e9d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/RowGroupIndex.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.google.common.collect.ImmutableList; - -import java.util.List; - -import static com.google.common.base.Preconditions.checkNotNull; - -public class RowGroupIndex -{ - private final List positions; - private final ColumnStatistics statistics; - - public RowGroupIndex(List positions, ColumnStatistics statistics) - { - this.positions = ImmutableList.copyOf(checkNotNull(positions, "positions is null")); - this.statistics = checkNotNull(statistics, "statistics is null"); - } - - public List getPositions() - { - return positions; - } - - public ColumnStatistics getColumnStatistics() - { - return statistics; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Stream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Stream.java deleted file mode 100644 index 3440143b0d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/Stream.java +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public class Stream -{ - public enum StreamKind - { - PRESENT, - DATA, - LENGTH, - DICTIONARY_DATA, - DICTIONARY_COUNT, - SECONDARY, - ROW_INDEX, - IN_DICTIONARY, - ROW_GROUP_DICTIONARY, - ROW_GROUP_DICTIONARY_LENGTH, - } - - private final int column; - private final StreamKind streamKind; - private final int length; - private final boolean useVInts; - - public Stream(int column, StreamKind streamKind, int length, boolean useVInts) - { - this.column = column; - this.streamKind = checkNotNull(streamKind, "streamKind is null"); - this.length = length; - this.useVInts = useVInts; - } - - public int getColumn() - { - return column; - } - - public StreamKind getStreamKind() - { - return streamKind; - } - - public int getLength() - { - return length; - } - - public boolean isUseVInts() - { - return useVInts; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("column", column) - .add("streamKind", streamKind) - .add("length", length) - .add("useVInts", useVInts) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java deleted file mode 100644 index 17cb8ba289..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public class StringStatistics - implements RangeStatistics -{ - private final String minimum; - private final String maximum; - - public StringStatistics(String minimum, String maximum) - { - this.minimum = minimum; - this.maximum = maximum; - } - - @Override - public String getMin() - { - return minimum; - } - - @Override - public String getMax() - { - return maximum; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeFooter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeFooter.java deleted file mode 100644 index 567ad0d7ad..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeFooter.java +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.google.common.collect.ImmutableList; - -import java.util.List; - -import static com.google.common.base.Preconditions.checkNotNull; - -public class StripeFooter -{ - private final List streams; - private final List columnEncodings; - - public StripeFooter(List streams, List columnEncodings) - { - this.streams = ImmutableList.copyOf(checkNotNull(streams, "streams is null")); - this.columnEncodings = ImmutableList.copyOf(checkNotNull(columnEncodings, "columnEncodings is null")); - } - - public List getColumnEncodings() - { - return columnEncodings; - } - - public List getStreams() - { - return streams; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeInformation.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeInformation.java deleted file mode 100644 index 5a5a55fe1a..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeInformation.java +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import static com.google.common.base.MoreObjects.toStringHelper; - -public class StripeInformation -{ - private final int numberOfRows; - private final long offset; - private final long indexLength; - private final long dataLength; - private final long footerLength; - - public StripeInformation(int numberOfRows, long offset, long indexLength, long dataLength, long footerLength) - { - this.numberOfRows = numberOfRows; - this.offset = offset; - this.indexLength = indexLength; - this.dataLength = dataLength; - this.footerLength = footerLength; - } - - public int getNumberOfRows() - { - return numberOfRows; - } - - public long getOffset() - { - return offset; - } - - public long getIndexLength() - { - return indexLength; - } - - public long getDataLength() - { - return dataLength; - } - - public long getFooterLength() - { - return footerLength; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("numberOfRows", numberOfRows) - .add("offset", offset) - .add("indexLength", indexLength) - .add("dataLength", dataLength) - .add("footerLength", footerLength) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeStatistics.java deleted file mode 100644 index 5338325482..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StripeStatistics.java +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.google.common.collect.ImmutableList; - -import java.util.List; - -import static com.google.common.base.Preconditions.checkNotNull; - -public class StripeStatistics -{ - private final List columnStatistics; - - public StripeStatistics(List columnStatistics) - { - this.columnStatistics = ImmutableList.copyOf(checkNotNull(columnStatistics, "columnStatistics is null")); - } - - public List getColumnStatistics() - { - return columnStatistics; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java deleted file mode 100644 index cb38b2ed6e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.BooleanVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class BooleanStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream dataStream; - - private boolean rowGroupOpen; - - public BooleanStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if 
(!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - BooleanVector booleanVector = (BooleanVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(booleanVector.isNull, false); - dataStream.getSetBits(nextBatchSize, booleanVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, booleanVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.getSetBits(nextBatchSize, booleanVector.vector, booleanVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(BooleanStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, BooleanStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - 
dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java deleted file mode 100644 index 3688d2fce2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.ByteStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class ByteStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(ByteStream.class); - @Nullable - private ByteStream dataStream; - - private boolean rowGroupOpen; - - public ByteStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object 
vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - LongVector byteVector = (LongVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(byteVector.isNull, false); - dataStream.nextVector(nextBatchSize, byteVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, byteVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextVector(nextBatchSize, byteVector.vector, byteVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(ByteStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; 
- dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java deleted file mode 100644 index afca11996d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.DoubleVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.DoubleStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class DoubleStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(DoubleStream.class); - @Nullable - private DoubleStream dataStream; - - private boolean rowGroupOpen; - - public DoubleStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void 
readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - DoubleVector doubleVector = (DoubleVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(doubleVector.isNull, false); - dataStream.nextVector(nextBatchSize, doubleVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, doubleVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextVector(nextBatchSize, doubleVector.vector, doubleVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(DoubleStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, DoubleStream.class); - - readOffset = 0; - 
nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java deleted file mode 100644 index 8d75390337..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.DoubleVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.FloatStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class FloatStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(FloatStream.class); - @Nullable - private FloatStream dataStream; - - private boolean rowGroupOpen; - - public FloatStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void 
readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - // we could add a float vector but Presto currently doesn't support floats - DoubleVector floatVector = (DoubleVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(floatVector.isNull, false); - dataStream.nextVector(nextBatchSize, floatVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, floatVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextVector(nextBatchSize, floatVector.vector, floatVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(FloatStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = 
dataStreamSources.getStreamSource(streamDescriptor, DATA, FloatStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java deleted file mode 100644 index 8048e61335..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.SliceVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.json.JsonReader; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import io.airlift.slice.DynamicSliceOutput; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class JsonStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - private final JsonReader jsonReader; - - private boolean stripeOpen; - private boolean rowGroupOpen; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; - - private int readOffset; - private int nextBatchSize; - - @Nullable - private StreamSources dictionaryStreamSources; - @Nullable - private StreamSources dataStreamSources; - - private List encoding; - - public 
JsonStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.jsonReader = createJsonReader(streamDescriptor, false, hiveStorageTimeZone); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the length reader - readOffset = presentStream.countBitsSet(readOffset); - } - - jsonReader.skip(readOffset); - } - - SliceVector sliceVector = (SliceVector) vector; - if (presentStream != null) { - presentStream.getUnsetBits(nextBatchSize, isNullVector); - } - - DynamicSliceOutput out = new DynamicSliceOutput(1024); - for (int i = 0; i < nextBatchSize; i++) { - if (!isNullVector[i]) { - out.reset(); - JsonGenerator generator = new JsonFactory().createGenerator(out); - jsonReader.readNextValueInto(generator); - sliceVector.vector[i] = out.copySlice(); - } - else { - sliceVector.vector[i] = null; - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - - if (!stripeOpen) { - jsonReader.openStripe(dictionaryStreamSources, encoding); - } - - jsonReader.openRowGroup(dataStreamSources); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - this.dictionaryStreamSources = dictionaryStreamSources; - this.dataStreamSources = null; - this.encoding = encoding; - - presentStreamSource = missingStreamSource(BooleanStream.class); - - stripeOpen = false; - rowGroupOpen = false; - - readOffset = 0; - nextBatchSize = 0; - - 
Arrays.fill(isNullVector, false); - - presentStream = null; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - this.dataStreamSources = dataStreamSources; - - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - - rowGroupOpen = false; - - readOffset = 0; - nextBatchSize = 0; - - Arrays.fill(isNullVector, false); - - presentStream = null; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java deleted file mode 100644 index bd847f6efd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class LongDictionaryStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dictionaryDataStreamSource = missingStreamSource(LongStream.class); - private int dictionarySize; - @Nonnull - private long[] dictionary = new long[0]; - - @Nonnull - private StreamSource inDictionaryStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream inDictionaryStream; - private final boolean[] inDictionary = new boolean[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource dataStreamSource; - @Nullable - private LongStream 
dataStream; - - private boolean dictionaryOpen; - private boolean rowGroupOpen; - - public LongDictionaryStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the length reader - readOffset = presentStream.countBitsSet(readOffset); - } - - if (inDictionaryStream != null) { - inDictionaryStream.skip(readOffset); - } - - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - LongVector longVector = (LongVector) vector; - - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(longVector.isNull, false); - dataStream.nextLongVector(nextBatchSize, longVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); - } - } - - if (inDictionaryStream == null) { - Arrays.fill(inDictionary, true); - } - else { - inDictionaryStream.getSetBits(nextBatchSize, inDictionary, longVector.isNull); - } - - for (int i = 0; i < nextBatchSize; i++) { - if (!longVector.isNull[i]) { - if (inDictionary[i]) { - longVector.vector[i] = dictionary[((int) longVector.vector[i])]; - } - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - // 
read the dictionary - if (!dictionaryOpen && dictionarySize > 0) { - if (dictionary.length < dictionarySize) { - dictionary = new long[dictionarySize]; - } - - LongStream dictionaryStream = dictionaryDataStreamSource.openStream(); - verifyFormat(dictionaryStream != null, "Dictionary is not empty but data stream is not present"); - dictionaryStream.nextLongVector(dictionarySize, dictionary); - } - dictionaryOpen = true; - - presentStream = presentStreamSource.openStream(); - inDictionaryStream = inDictionaryStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - dictionaryDataStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, LongStream.class); - dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); - dictionaryOpen = false; - - inDictionaryStreamSource = missingStreamSource(BooleanStream.class); - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - inDictionaryStreamSource = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - 
.toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java deleted file mode 100644 index b50201cc0e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class LongDirectStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream dataStream; - - private boolean rowGroupOpen; - - public LongDirectStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void 
readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - LongVector longVector = (LongVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(longVector.isNull, false); - dataStream.nextLongVector(nextBatchSize, longVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); - - readOffset = 0; - nextBatchSize = 0; 
- - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java deleted file mode 100644 index 6943049acd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; - -public class LongStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - private final LongDirectStreamReader directReader; - private final LongDictionaryStreamReader dictionaryReader; - private StreamReader currentReader; - - public LongStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - directReader = new LongDirectStreamReader(streamDescriptor); - dictionaryReader = new LongDictionaryStreamReader(streamDescriptor); - } - - @Override - public void prepareNextRead(int batchSize) - { - currentReader.prepareNextRead(batchSize); - } - - @Override - public void readBatch(Object vector) - throws IOException - { - currentReader.readBatch(vector); - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); - if (kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT) { - currentReader = directReader; - } - else if (kind == DICTIONARY) { - currentReader = dictionaryReader; - } - else { - throw new IllegalArgumentException("Unsupported encoding " + kind); - } - - currentReader.startStripe(dictionaryStreamSources, encoding); - } - - @Override - public void 
startRowGroup(StreamSources dataStreamSources) - throws IOException - { - currentReader.startRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java deleted file mode 100644 index bf7f362be5..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.SliceVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.*; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class SliceDictionaryStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource dictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); - private boolean dictionaryOpen; - private int dictionarySize; - @Nonnull - private Slice[] dictionary = new Slice[0]; - - @Nonnull - private StreamSource dictionaryLengthStreamSource = missingStreamSource(LongStream.class); - @Nonnull - private int[] dictionaryLength = new int[0]; - - @Nonnull - private StreamSource inDictionaryStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream inDictionaryStream; - private final boolean[] 
inDictionary = new boolean[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource rowGroupDictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); - @Nonnull - private Slice[] rowGroupDictionary = new Slice[0]; - - @Nonnull - private StreamSource rowGroupDictionaryLengthStreamSource = missingStreamSource(RowGroupDictionaryLengthStream.class); - @Nonnull - private int[] rowGroupDictionaryLength = new int[0]; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream dataStream; - @Nonnull - private final int[] dataVector = new int[Vector.MAX_VECTOR_LENGTH]; - - private boolean rowGroupOpen; - - public SliceDictionaryStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the length reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - if (inDictionaryStream != null) { - inDictionaryStream.skip(readOffset); - } - dataStream.skip(readOffset); - } - } - - SliceVector sliceVector = (SliceVector) vector; - - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(isNullVector, false); - dataStream.nextIntVector(nextBatchSize, dataVector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is 
not null but data stream is not present"); - dataStream.nextIntVector(nextBatchSize, dataVector, isNullVector); - } - } - - if (inDictionaryStream == null) { - Arrays.fill(inDictionary, true); - } - else { - inDictionaryStream.getSetBits(nextBatchSize, inDictionary, isNullVector); - } - - for (int i = 0; i < nextBatchSize; i++) { - if (isNullVector[i]) { - sliceVector.vector[i] = null; - } - else if (inDictionary[i]) { - sliceVector.vector[i] = dictionary[dataVector[i]]; - } - else { - sliceVector.vector[i] = rowGroupDictionary[dataVector[i]]; - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - // read the dictionary - if (!dictionaryOpen && dictionarySize > 0) { - // resize the dictionary array if necessary - if (dictionary.length < dictionarySize) { - dictionary = new Slice[dictionarySize]; - dictionaryLength = new int[dictionarySize]; - } - - // read the lengths - LongStream lengthStream = dictionaryLengthStreamSource.openStream(); - verifyFormat(lengthStream != null, "Dictionary is not empty but dictionary length stream is not present"); - lengthStream.nextIntVector(dictionarySize, dictionaryLength); - - ByteArrayStream dictionaryDataStream = dictionaryDataStreamSource.openStream(); - readDictionary(dictionaryDataStream, dictionarySize, dictionaryLength, dictionary); - } - dictionaryOpen = true; - - // read row group dictionary - RowGroupDictionaryLengthStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream(); - if (dictionaryLengthStream != null) { - int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount(); - - // resize the dictionary array if necessary - if (rowGroupDictionary.length < rowGroupDictionarySize) { - rowGroupDictionary = new Slice[rowGroupDictionarySize]; - rowGroupDictionaryLength = new int[rowGroupDictionarySize]; - } - - // read the lengths - dictionaryLengthStream.nextIntVector(rowGroupDictionarySize, rowGroupDictionaryLength); - - ByteArrayStream 
dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream(); - readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, rowGroupDictionary); - } - dictionaryOpen = true; - - presentStream = presentStreamSource.openStream(); - inDictionaryStream = inDictionaryStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - private static void readDictionary(@Nullable ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, Slice[] dictionary) - throws IOException - { - // sum lengths - int totalLength = 0; - for (int i = 0; i < dictionarySize; i++) { - totalLength += dictionaryLength[i]; - } - - // read dictionary data - byte[] dictionaryData = new byte[0]; - if (totalLength > 0) { - verifyFormat(dictionaryDataStream != null, "Dictionary length is not zero but dictionary data stream is not present"); - dictionaryData = dictionaryDataStream.next(totalLength); - } - - // build dictionary slices - int offset = 0; - for (int i = 0; i < dictionarySize; i++) { - int length = dictionaryLength[i]; - dictionary[i] = Slices.wrappedBuffer(dictionaryData, offset, length); - offset += length; - } - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - dictionaryDataStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, ByteArrayStream.class); - dictionaryLengthStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class); - dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); - dictionaryOpen = false; - - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(LongStream.class); - - inDictionaryStreamSource = missingStreamSource(BooleanStream.class); - rowGroupDictionaryLengthStreamSource = missingStreamSource(RowGroupDictionaryLengthStream.class); - 
rowGroupDictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); - - // the "in dictionary" stream signals if the value is in the stripe or row group dictionary - inDictionaryStreamSource = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class); - rowGroupDictionaryLengthStreamSource = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY_LENGTH, RowGroupDictionaryLengthStream.class); - rowGroupDictionaryDataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY, ByteArrayStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java deleted file mode 100644 index 994b25d29a..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.SliceVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.*; -import io.airlift.slice.Slices; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class SliceDirectStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource lengthStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream lengthStream; - private 
final int[] lengthVector = new int[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource dataByteSource = missingStreamSource(ByteArrayStream.class); - @Nullable - private ByteArrayStream dataStream; - - private boolean rowGroupOpen; - - public SliceDirectStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the length reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - long dataSkipSize = lengthStream.sum(readOffset); - if (dataSkipSize > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(Ints.checkedCast(dataSkipSize)); - } - } - } - - SliceVector sliceVector = (SliceVector) vector; - if (presentStream == null) { - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - lengthStream.nextIntVector(nextBatchSize, lengthVector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector); - if (nullValues != nextBatchSize) { - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - lengthStream.nextIntVector(nextBatchSize, lengthVector, isNullVector); - } - } - - int totalLength = 0; - for (int i = 0; i < nextBatchSize; i++) { - if (!isNullVector[i]) { - totalLength += lengthVector[i]; - } - } - - byte[] data = new byte[0]; - if (totalLength > 0) { - verifyFormat(dataStream != null, "Value is 
not null but data stream is not present"); - data = dataStream.next(totalLength); - } - - int offset = 0; - for (int i = 0; i < nextBatchSize; i++) { - if (!isNullVector[i]) { - int length = lengthVector[i]; - sliceVector.vector[i] = Slices.wrappedBuffer(data, offset, length); - offset += length; - } - else { - sliceVector.vector[i] = null; - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - lengthStream = lengthStreamSource.openStream(); - dataStream = dataByteSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - lengthStreamSource = missingStreamSource(LongStream.class); - dataByteSource = missingStreamSource(ByteArrayStream.class); - - readOffset = 0; - nextBatchSize = 0; - - Arrays.fill(isNullVector, false); - - presentStream = null; - lengthStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - lengthStreamSource = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class); - dataByteSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteArrayStream.class); - - readOffset = 0; - nextBatchSize = 0; - - Arrays.fill(isNullVector, false); - - presentStream = null; - lengthStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java deleted file mode 100644 index e046dff632..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; - -public class SliceStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - private final SliceDirectStreamReader directReader; - private final SliceDictionaryStreamReader dictionaryReader; - private StreamReader currentReader; - - public SliceStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - directReader = new 
SliceDirectStreamReader(streamDescriptor); - dictionaryReader = new SliceDictionaryStreamReader(streamDescriptor); - } - - @Override - public void readBatch(Object vector) - throws IOException - { - currentReader.readBatch(vector); - } - - @Override - public void prepareNextRead(int batchSize) - { - currentReader.prepareNextRead(batchSize); - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - ColumnEncodingKind columnEncodingKind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); - if (columnEncodingKind == DIRECT || columnEncodingKind == DIRECT_V2 || columnEncodingKind == DWRF_DIRECT) { - currentReader = directReader; - } - else if (columnEncodingKind == DICTIONARY || columnEncodingKind == DICTIONARY_V2) { - currentReader = dictionaryReader; - } - else { - throw new IllegalArgumentException("Unsupported encoding " + columnEncodingKind); - } - - currentReader.startStripe(dictionaryStreamSources, encoding); - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - currentReader.startRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReader.java deleted file mode 100644 index e7ea384be5..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReader.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -public interface StreamReader -{ - void readBatch(Object vector) - throws IOException; - - void prepareNextRead(int batchSize); - - void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException; - - void startRowGroup(StreamSources dataStreamSources) - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java deleted file mode 100644 index 7d0e8cc9f2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.joda.time.DateTimeZone; - -public final class StreamReaders -{ - private StreamReaders() - { - } - - public static StreamReader createStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - switch (streamDescriptor.getStreamType()) { - case BOOLEAN: - return new BooleanStreamReader(streamDescriptor); - case BYTE: - return new ByteStreamReader(streamDescriptor); - case SHORT: - case INT: - case LONG: - case DATE: - return new LongStreamReader(streamDescriptor); - case FLOAT: - return new FloatStreamReader(streamDescriptor); - case DOUBLE: - return new DoubleStreamReader(streamDescriptor); - case BINARY: - case STRING: - return new SliceStreamReader(streamDescriptor); - case TIMESTAMP: - return new TimestampStreamReader(streamDescriptor, hiveStorageTimeZone); - case STRUCT: - case LIST: - case MAP: - return new JsonStreamReader(streamDescriptor, hiveStorageTimeZone); - case UNION: - case DECIMAL: - case VARCHAR: - case CHAR: - default: - throw new IllegalArgumentException("Unsupported type: " + streamDescriptor.getStreamType()); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java deleted file mode 100644 index ba96f7cdcb..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class TimestampStreamReader - implements StreamReader -{ - private static final int MILLIS_PER_SECOND = 1000; - - private final StreamDescriptor streamDescriptor; - private final long baseTimestampInSeconds; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = 
missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource secondsStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream secondsStream; - - @Nonnull - private StreamSource nanosStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream nanosStream; - - private final long[] nanosVector = new long[Vector.MAX_VECTOR_LENGTH]; - - private boolean rowGroupOpen; - - public TimestampStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / MILLIS_PER_SECOND; - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - secondsStream.skip(readOffset); - nanosStream.skip(readOffset); - } - } - - LongVector longVector = (LongVector) vector; - if (presentStream == null) { - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - Arrays.fill(longVector.isNull, false); - secondsStream.nextLongVector(nextBatchSize, longVector.vector); - 
nanosStream.nextLongVector(nextBatchSize, nanosVector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - secondsStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); - nanosStream.nextLongVector(nextBatchSize, nanosVector, longVector.isNull); - } - } - - // merge seconds and nanos together - for (int i = 0; i < nextBatchSize; i++) { - longVector.vector[i] = decodeTimestamp(longVector.vector[i], nanosVector[i], baseTimestampInSeconds); - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - secondsStream = secondsStreamSource.openStream(); - nanosStream = nanosStreamSource.openStream(); - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - secondsStreamSource = missingStreamSource(LongStream.class); - nanosStreamSource = missingStreamSource(LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - secondsStream = null; - nanosStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - secondsStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); - nanosStreamSource = dataStreamSources.getStreamSource(streamDescriptor, SECONDARY, LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - secondsStream = null; - nanosStream = null; - - 
rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } - - // This comes from the Apache Hive ORC code - public static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) - { - long millis = (seconds + baseTimestampInSeconds) * MILLIS_PER_SECOND; - long nanos = parseNanos(serializedNanos); - - // the rounding error exists because java always rounds up when dividing integers - // -42001/1000 = -42; and -42001 % 1000 = -1 (+ 1000) - // to get the correct value we need - // (-42 - 1)*1000 + 999 = -42001 - // (42)*1000 + 1 = 42001 - if (millis < 0 && nanos != 0) { - millis -= 1000; - } - // Truncate nanos to millis and add to mills - return millis + (nanos / 1000000); - } - - // This comes from the Apache Hive ORC code - private static int parseNanos(long serialized) - { - int zeros = ((int) serialized) & 0x7; // 0b111 - int result = (int) (serialized >>> 3); - if (zeros != 0) { - for (int i = 0; i <= zeros; ++i) { - result *= 10; - } - } - return result; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/BooleanStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/BooleanStream.java deleted file mode 100644 index 6d7322e45a..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/BooleanStream.java +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.checkpoint.BooleanStreamCheckpoint; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkState; - -public class BooleanStream - implements ValueStream -{ - private static final int HIGH_BIT_MASK = 0x80; // was 0b1000_0000 for JDK 7 - private final ByteStream byteStream; - private byte data; - private int bitsInData; - - public BooleanStream(OrcInputStream byteStream) - { - this.byteStream = new ByteStream(byteStream); - } - - private void readByte() - throws IOException - { - checkState(bitsInData == 0); - data = byteStream.next(); - bitsInData = 8; - } - - public boolean nextBit() - throws IOException - { - // read more data if necessary - if (bitsInData == 0) { - readByte(); - } - - // read bit - boolean result = (data & HIGH_BIT_MASK) != 0; - - // mark bit consumed - data <<= 1; - bitsInData--; - - return result; - } - - @Override - public Class getCheckpointType() - { - return BooleanStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(BooleanStreamCheckpoint checkpoint) - throws IOException - { - byteStream.seekToCheckpoint(checkpoint.getByteStreamCheckpoint()); - bitsInData = 0; - skip(checkpoint.getOffset()); - } - - @Override - public void skip(int items) - throws IOException - { - if (bitsInData >= items) { - data <<= items; - bitsInData -= items; - } - else { - items -= bitsInData; - bitsInData = 0; - - byteStream.skip(items >>> 3); - items = items & 0x07; // 0b111; - - if (items != 0) { - readByte(); - data <<= items; - bitsInData -= items; - } - } - } - - public int countBitsSet(int items) - throws IOException - { - int count = 0; - - // count buffered data - if (items > bitsInData && bitsInData > 0) { - count += bitCount(data); - items -= bitsInData; - bitsInData = 0; - } - - // 
count whole bytes - while (items > 8) { - count += bitCount(byteStream.next()); - items -= 8; - } - - // count remaining bits - for (int i = 0; i < items; i++) { - // read more data if necessary - if (bitsInData == 0) { - readByte(); - } - - // read bit - if ((data & HIGH_BIT_MASK) != 0) { - count++; - } - - // mark bit consumed - data <<= 1; - bitsInData--; - } - - return count; - } - - /** - * Sets the vector element to true if the bit is set. - */ - public void getSetBits(int batchSize, boolean[] vector) - throws IOException - { - for (int i = 0; i < batchSize; i++) { - // read more data if necessary - if (bitsInData == 0) { - readByte(); - } - - // read bit - vector[i] = (data & HIGH_BIT_MASK) != 0; - - // mark bit consumed - data <<= 1; - bitsInData--; - } - } - - /** - * Sets the vector element to true if the bit is set, skipping the null values. - */ - public void getSetBits(int batchSize, boolean[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < batchSize; i++) { - if (!isNull[i]) { - // read more data if necessary - if (bitsInData == 0) { - readByte(); - } - - // read bit - vector[i] = (data & HIGH_BIT_MASK) != 0; - - // mark bit consumed - data <<= 1; - bitsInData--; - } - } - } - - /** - * Sets the vector element to true if the bit is not set. 
- */ - public int getUnsetBits(int batchSize, boolean[] vector) - throws IOException - { - int count = 0; - for (int i = 0; i < batchSize; i++) { - // read more data if necessary - if (bitsInData == 0) { - readByte(); - } - - // read bit - vector[i] = (data & HIGH_BIT_MASK) == 0; - if (vector[i]) { - count++; - } - - // mark bit consumed - data <<= 1; - bitsInData--; - } - return count; - } - - private static int bitCount(byte data) - { - return Integer.bitCount(data & 0xFF); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java deleted file mode 100644 index 853609af56..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteArrayStreamCheckpoint; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; - -public class ByteArrayStream - implements ValueStream -{ - private final OrcInputStream inputStream; - - public ByteArrayStream(OrcInputStream inputStream) - { - this.inputStream = checkNotNull(inputStream, "inputStream is null"); - } - - public byte[] next(int length) - throws IOException - { - byte[] data = new byte[length]; - readFully(inputStream, data, 0, length); - return data; - } - - public void next(int length, byte[] data) - throws IOException - { - readFully(inputStream, data, 0, length); - } - - @Override - public Class getCheckpointType() - { - return ByteArrayStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(ByteArrayStreamCheckpoint checkpoint) - throws IOException - { - inputStream.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); - } - - @Override - public void skip(int skipSize) - throws IOException - { - skipFully(inputStream, skipSize); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java deleted file mode 100644 index adb27cbeb9..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteStreamCheckpoint; - -import java.io.IOException; -import java.util.Arrays; - -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; - -public class ByteStream - implements ValueStream -{ - private final OrcInputStream input; - private final byte[] buffer = new byte[MIN_REPEAT_SIZE + 127]; - private int length; - private int offset; - private long lastReadInputCheckpoint; - - public ByteStream(OrcInputStream input) - { - this.input = input; - lastReadInputCheckpoint = input.getCheckpoint(); - } - - // This is based on the Apache Hive ORC code - private void readNextBlock() - throws IOException - { - lastReadInputCheckpoint = input.getCheckpoint(); - - int control = input.read(); - verifyFormat(control != -1, "Read past end of buffer RLE byte from %s", input); - - offset = 0; - - // if byte high bit is not set, this is a repetition; otherwise it is a literal sequence - if ((control & 0x80) == 0) { - length = control + MIN_REPEAT_SIZE; - - // read the repeated value - int value = input.read(); - verifyFormat(value != -1, "Reading RLE byte got EOF"); - - // fill buffer with the value - Arrays.fill(buffer, 0, length, (byte) value); - } - else { - // length is 2's complement of byte - length = 0x100 - control; - - // 
read the literals into the buffer - readFully(input, buffer, 0, length); - } - } - - @Override - public Class getCheckpointType() - { - return ByteStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(ByteStreamCheckpoint checkpoint) - throws IOException - { - // if the checkpoint is within the current buffer, just adjust the pointer - if (lastReadInputCheckpoint == checkpoint.getInputStreamCheckpoint() && checkpoint.getOffset() <= length) { - offset = checkpoint.getOffset(); - } - else { - // otherwise, discard the buffer and start over - input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); - length = 0; - offset = 0; - skip(checkpoint.getOffset()); - } - } - - @Override - public void skip(int items) - throws IOException - { - while (items > 0) { - if (offset == length) { - readNextBlock(); - } - long consume = Math.min(items, length - offset); - offset += consume; - items -= consume; - } - } - - public byte next() - throws IOException - { - if (offset == length) { - readNextBlock(); - } - return buffer[offset++]; - } - - public void nextVector(long items, long[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = next(); - } - } - - public void nextVector(long items, long[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = next(); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java deleted file mode 100644 index 6c3e5ea6c9..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the 
License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; - -import javax.annotation.Nullable; -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkNotNull; - -public class CheckpointStreamSource, C extends StreamCheckpoint> - implements StreamSource -{ - public static , C extends StreamCheckpoint> CheckpointStreamSource createCheckpointStreamSource(S stream, StreamCheckpoint checkpoint) - { - checkNotNull(stream, "stream is null"); - checkNotNull(checkpoint, "checkpoint is null"); - - Class checkpointType = stream.getCheckpointType(); - C verifiedCheckpoint = OrcStreamUtils.checkType(checkpoint, checkpointType, "Checkpoint"); - return new CheckpointStreamSource(stream, verifiedCheckpoint); - } - - private final S stream; - private final C checkpoint; - - public CheckpointStreamSource(S stream, C checkpoint) - { - this.stream = checkNotNull(stream, "stream is null"); - this.checkpoint = checkNotNull(checkpoint, "checkpoint is null"); - } - - @Override - public Class getStreamType() - { - return (Class) stream.getClass(); - } - - @Nullable - @Override - public S openStream() - throws IOException - { - stream.seekToCheckpoint(checkpoint); - return stream; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("stream", stream) - .add("checkpoint", checkpoint) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java deleted file mode 100644 index 08f1f160e2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.DoubleStreamCheckpoint; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkPositionIndex; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; -import static io.airlift.slice.SizeOf.SIZE_OF_DOUBLE; - -public class DoubleStream - implements ValueStream -{ - private final OrcInputStream input; - private final byte[] buffer = new byte[Vector.MAX_VECTOR_LENGTH * SIZE_OF_DOUBLE]; - private final Slice slice = Slices.wrappedBuffer(buffer); - - public DoubleStream(OrcInputStream input) - { - this.input = input; - } - - @Override - public Class getCheckpointType() - { - return DoubleStreamCheckpoint.class; - } 
- - @Override - public void seekToCheckpoint(DoubleStreamCheckpoint checkpoint) - throws IOException - { - input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); - } - - @Override - public void skip(int items) - throws IOException - { - long length = items * SIZE_OF_DOUBLE; - skipFully(input, length); - } - - public double next() - throws IOException - { - readFully(input, buffer, 0, SIZE_OF_DOUBLE); - return slice.getDouble(0); - } - - public void nextVector(int items, double[] vector) - throws IOException - { - checkPositionIndex(items, vector.length); - checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); - - // buffer that number of values - readFully(input, buffer, 0, items * SIZE_OF_DOUBLE); - - // copy values directly into vector - Slices.wrappedDoubleArray(vector).setBytes(0, slice, 0, items * SIZE_OF_DOUBLE); - } - - public void nextVector(long items, double[] vector, boolean[] isNull) - throws IOException - { - // count the number of non nulls - int notNullCount = 0; - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - notNullCount++; - } - } - - // buffer that umber of values - readFully(input, buffer, 0, notNullCount * SIZE_OF_DOUBLE); - - // load them into the buffer - int elementIndex = 0; - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = slice.getDouble(elementIndex); - elementIndex += SIZE_OF_DOUBLE; - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java deleted file mode 100644 index 722c9470fd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.FloatStreamCheckpoint; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkPositionIndex; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; -import static io.airlift.slice.SizeOf.SIZE_OF_FLOAT; - -public class FloatStream - implements ValueStream -{ - private final OrcInputStream input; - private final byte[] buffer = new byte[Vector.MAX_VECTOR_LENGTH * SIZE_OF_FLOAT]; - private final Slice slice = Slices.wrappedBuffer(buffer); - - public FloatStream(OrcInputStream input) - { - this.input = input; - } - - @Override - public Class getCheckpointType() - { - return FloatStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(FloatStreamCheckpoint checkpoint) - throws IOException - { - input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); - } - - @Override - public void skip(int items) - throws IOException - { - long length = items * SIZE_OF_FLOAT; - skipFully(input, length); - } - - public float next() - throws IOException - { - readFully(input, buffer, 0, SIZE_OF_FLOAT); - return slice.getFloat(0); - } - - public void nextVector(int items, double[] vector) - throws IOException - { - checkPositionIndex(items, vector.length); - 
checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); - - // buffer that number of values - readFully(input, buffer, 0, items * SIZE_OF_FLOAT); - - // load them into the buffer one at a time since we are reading - // floats into a double vector - int elementIndex = 0; - for (int i = 0; i < items; i++) { - vector[i] = slice.getFloat(elementIndex); - elementIndex += SIZE_OF_FLOAT; - } - } - - public void nextVector(long items, double[] vector, boolean[] isNull) - throws IOException - { - // count the number of non nulls - int notNullCount = 0; - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - notNullCount++; - } - } - - // buffer that umber of values - readFully(input, buffer, 0, notNullCount * SIZE_OF_FLOAT); - - // load them into the buffer - int elementIndex = 0; - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = slice.getFloat(elementIndex); - elementIndex += SIZE_OF_FLOAT; - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java deleted file mode 100644 index 40753bfe75..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; - -import java.io.IOException; -import java.io.InputStream; - -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; - -// This is based on the Apache Hive ORC code -public final class LongDecode -{ - private LongDecode() - { - } - - enum FixedBitSizes - { - ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, - THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, - TWENTY, TWENTY_ONE, TWENTY_TWO, TWENTY_THREE, TWENTY_FOUR, TWENTY_SIX, - TWENTY_EIGHT, THIRTY, THIRTY_TWO, FORTY, FORTY_EIGHT, FIFTY_SIX, SIXTY_FOUR; - } - - /** - * Decodes the ordinal fixed bit value to actual fixed bit width value. - */ - public static int decodeBitWidth(int n) - { - if (n >= ONE.ordinal() && n <= TWENTY_FOUR.ordinal()) { - return n + 1; - } - else if (n == TWENTY_SIX.ordinal()) { - return 26; - } - else if (n == TWENTY_EIGHT.ordinal()) { - return 28; - } - else if (n == THIRTY.ordinal()) { - return 30; - } - else if (n == THIRTY_TWO.ordinal()) { - return 32; - } - else if (n == FORTY.ordinal()) { - return 40; - } - else if (n == FORTY_EIGHT.ordinal()) { - return 48; - } - else if (n == FIFTY_SIX.ordinal()) { - return 56; - } - else { - return 64; - } - } - - /** - * Gets the closest supported fixed bit width for the specified bit width. 
- */ - public static int getClosestFixedBits(int width) - { - if (width == 0) { - return 1; - } - - if (width >= 1 && width <= 24) { - return width; - } - else if (width > 24 && width <= 26) { - return 26; - } - else if (width > 26 && width <= 28) { - return 28; - } - else if (width > 28 && width <= 30) { - return 30; - } - else if (width > 30 && width <= 32) { - return 32; - } - else if (width > 32 && width <= 40) { - return 40; - } - else if (width > 40 && width <= 48) { - return 48; - } - else if (width > 48 && width <= 56) { - return 56; - } - else { - return 64; - } - } - - public static long readSignedVInt(InputStream inputStream) - throws IOException - { - long result = readUnsignedVInt(inputStream); - return (result >>> 1) ^ -(result & 1); - } - - public static long readUnsignedVInt(InputStream inputStream) - throws IOException - { - long result = 0; - int offset = 0; - long b; - do { - b = inputStream.read(); - verifyFormat(b != -1, "EOF while reading unsigned vint"); - result |= (b & 0x7F /* 0b0111_1111 */) << offset; - offset += 7; - } while ((b & 0x80 /* 0b1000_0000 */) != 0); - return result; - } - - public static long readVInt(boolean signed, InputStream inputStream) - throws IOException - { - if (signed) { - return readSignedVInt(inputStream); - } - else { - return readUnsignedVInt(inputStream); - } - } - - public static long zigzagDecode(long value) - { - return (value >>> 1) ^ -(value & 1); - } - - public static long readDwrfLong(InputStream input, OrcTypeKind type, boolean signed, boolean usesVInt) - throws IOException - { - if (usesVInt) { - return readVInt(signed, input); - } - else if (type == SHORT) { - return input.read() | (input.read() << 8); - } - else if (type == INT) { - return input.read() | (input.read() << 8) | (input.read() << 16) | (input.read() << 24); - } - else if (type == LONG) { - return ((long) input.read()) | - (((long) input.read()) << 8) | - (((long) input.read()) << 16) | - (((long) input.read()) << 24) | - (((long) 
input.read()) << 32) | - (((long) input.read()) << 40) | - (((long) input.read()) << 48) | - (((long) input.read()) << 56); - } - else { - throw new IllegalArgumentException(type + " type is not supported"); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStream.java deleted file mode 100644 index a6042dc978..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStream.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; - -import java.io.IOException; - -public interface LongStream - extends ValueStream -{ - long next() - throws IOException; - - void nextIntVector(int items, int[] vector) - throws IOException; - - void nextIntVector(int items, int[] vector, boolean[] isNull) - throws IOException; - - void nextLongVector(int items, long[] vector) - throws IOException; - - void nextLongVector(int items, long[] vector, boolean[] isNull) - throws IOException; - - long sum(int items) - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java deleted file mode 100644 index e037be6c3e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamDwrfCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkPositionIndex; -import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.readDwrfLong; - -public class LongStreamDwrf - implements LongStream -{ - private final OrcInputStream input; - private final OrcTypeKind orcTypeKind; - private final boolean signed; - private final boolean usesVInt; - - public LongStreamDwrf(OrcInputStream input, OrcTypeKind type, boolean signed, boolean usesVInt) - { - this.input = input; - this.orcTypeKind = type; - this.signed = signed; - this.usesVInt = usesVInt; - } - - @Override - public Class getCheckpointType() - { - return LongStreamDwrfCheckpoint.class; - } - - @Override - public void seekToCheckpoint(LongStreamCheckpoint checkpoint) - throws IOException - { - LongStreamDwrfCheckpoint dwrfCheckpoint = OrcStreamUtils.checkType(checkpoint, LongStreamDwrfCheckpoint.class, "Checkpoint"); - input.seekToCheckpoint(dwrfCheckpoint.getInputStreamCheckpoint()); - } - - @Override - public void skip(int items) - throws IOException - { - // there is no fast way to skip values - for (int i = 0; i < items; i++) { - next(); - } - } - - @Override - public long sum(int items) - throws IOException - { - long sum = 0; - for (int i = 0; i < items; i++) { - sum += next(); - } - return sum; - } - - @Override - public long next() - throws IOException - { - return readDwrfLong(input, orcTypeKind, signed, usesVInt); - } - - @Override - public void nextIntVector(int items, int[] vector) - throws IOException - { - checkPositionIndex(items, vector.length); - 
checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); - - for (int i = 0; i < items; i++) { - vector[i] = Ints.checkedCast(next()); - } - } - - @Override - public void nextIntVector(int items, int[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = Ints.checkedCast(next()); - } - } - } - - @Override - public void nextLongVector(int items, long[] vector) - throws IOException - { - checkPositionIndex(items, vector.length); - - for (int i = 0; i < items; i++) { - vector[i] = next(); - } - } - - @Override - public void nextLongVector(int items, long[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = next(); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java deleted file mode 100644 index 29a6d25ef6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV1Checkpoint; - -import java.io.IOException; - -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; - -public class LongStreamV1 - implements LongStream -{ - private static final int MAX_LITERAL_SIZE = 128; - - private final OrcInputStream input; - private final boolean signed; - private final long[] literals = new long[MAX_LITERAL_SIZE]; - private int numLiterals; - private int delta; - private int used; - private boolean repeat; - private long lastReadInputCheckpoint; - - public LongStreamV1(OrcInputStream input, boolean signed) - { - this.input = input; - this.signed = signed; - lastReadInputCheckpoint = input.getCheckpoint(); - } - - // This comes from the Apache Hive ORC code - private void readValues() - throws IOException - { - lastReadInputCheckpoint = input.getCheckpoint(); - - int control = input.read(); - verifyFormat(control != -1, "Read past end of RLE integer from %s", input); - - if (control < 0x80) { - numLiterals = control + MIN_REPEAT_SIZE; - used = 0; - repeat = true; - delta = input.read(); - verifyFormat(delta != -1, "End of stream in RLE Integer from %s", input); - - // convert from 0 to 255 to -128 to 127 by converting to a signed byte - // noinspection SillyAssignment - delta = (byte) delta; - literals[0] = LongDecode.readVInt(signed, input); - } - else { - numLiterals = 0x100 - control; - used = 0; - repeat = false; - for (int i = 0; i < numLiterals; ++i) { - literals[i] = LongDecode.readVInt(signed, input); - } - } - } - - @Override - // This comes from the Apache Hive ORC code - public long next() - throws IOException - { - long result; - if (used == 
numLiterals) { - readValues(); - } - if (repeat) { - result = literals[0] + (used++) * delta; - } - else { - result = literals[used++]; - } - return result; - } - - @Override - public Class getCheckpointType() - { - return LongStreamV1Checkpoint.class; - } - - @Override - public void seekToCheckpoint(LongStreamCheckpoint checkpoint) - throws IOException - { - LongStreamV1Checkpoint v1Checkpoint = OrcStreamUtils.checkType(checkpoint, LongStreamV1Checkpoint.class, "Checkpoint"); - - // if the checkpoint is within the current buffer, just adjust the pointer - if (lastReadInputCheckpoint == v1Checkpoint.getInputStreamCheckpoint() && v1Checkpoint.getOffset() <= numLiterals) { - used = v1Checkpoint.getOffset(); - } - else { - // otherwise, discard the buffer and start over - input.seekToCheckpoint(v1Checkpoint.getInputStreamCheckpoint()); - numLiterals = 0; - used = 0; - skip(v1Checkpoint.getOffset()); - } - } - - @Override - public void skip(int items) - throws IOException - { - while (items > 0) { - if (used == numLiterals) { - readValues(); - } - long consume = Math.min(items, numLiterals - used); - used += consume; - items -= consume; - } - } - - @Override - public long sum(int items) - throws IOException - { - long sum = 0; - for (int i = 0; i < items; i++) { - sum += next(); - } - return sum; - } - - @Override - public void nextLongVector(int items, long[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = next(); - } - } - - @Override - public void nextLongVector(int items, long[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = next(); - } - } - } - - @Override - public void nextIntVector(int items, int[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = Ints.checkedCast(next()); - } - } - - @Override - public void nextIntVector(int items, int[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) 
{ - if (!isNull[i]) { - vector[i] = Ints.checkedCast(next()); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java deleted file mode 100644 index f22b3681d2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java +++ /dev/null @@ -1,452 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV2Checkpoint; - -import java.io.IOException; -import java.io.InputStream; - -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; - -/** - * @see {@link org.apache.hadoop.hive.ql.io.orc.RunLengthIntegerWriterV2} for description of various lightweight compression techniques. 
- */ -// This comes from the Apache Hive ORC code -public class LongStreamV2 - implements LongStream -{ - private static final int MAX_LITERAL_SIZE = 512; - - private enum EncodingType - { - SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA - } - - private final OrcInputStream input; - private final boolean signed; - private final long[] literals = new long[MAX_LITERAL_SIZE]; - private int numLiterals; - private int used; - private final boolean skipCorrupt; - private long lastReadInputCheckpoint; - - public LongStreamV2(OrcInputStream input, boolean signed, boolean skipCorrupt) - { - this.input = input; - this.signed = signed; - this.skipCorrupt = skipCorrupt; - lastReadInputCheckpoint = input.getCheckpoint(); - } - - // This comes from the Apache Hive ORC code - private void readValues() - throws IOException - { - lastReadInputCheckpoint = input.getCheckpoint(); - - // read the first 2 bits and determine the encoding type - int firstByte = input.read(); - verifyFormat(firstByte >= 0, "Read past end of RLE integer from %s", input); - - int enc = (firstByte >>> 6) & 0x03; - if (EncodingType.SHORT_REPEAT.ordinal() == enc) { - readShortRepeatValues(firstByte); - } - else if (EncodingType.DIRECT.ordinal() == enc) { - readDirectValues(firstByte); - } - else if (EncodingType.PATCHED_BASE.ordinal() == enc) { - readPatchedBaseValues(firstByte); - } - else { - readDeltaValues(firstByte); - } - } - - // This comes from the Apache Hive ORC code - private void readDeltaValues(int firstByte) - throws IOException - { - // extract the number of fixed bits - int fixedBits = (firstByte >>> 1) & 0x1f; - if (fixedBits != 0) { - fixedBits = LongDecode.decodeBitWidth(fixedBits); - } - - // extract the blob run length - int length = (firstByte & 0x01) << 8; - length |= input.read(); - - // read the first value stored as vint - long firstVal = LongDecode.readVInt(signed, input); - - // store first value to result buffer - literals[numLiterals++] = firstVal; - - // if fixed bits is 0 then all 
values have fixed delta - long prevVal; - if (fixedBits == 0) { - // read the fixed delta value stored as vint (deltas can be negative even - // if all number are positive) - long fixedDelta = LongDecode.readSignedVInt(input); - - // add fixed deltas to adjacent values - for (int i = 0; i < length; i++) { - literals[numLiterals++] = literals[numLiterals - 2] + fixedDelta; - } - } - else { - long deltaBase = LongDecode.readSignedVInt(input); - // add delta base and first value - literals[numLiterals++] = firstVal + deltaBase; - prevVal = literals[numLiterals - 1]; - length -= 1; - - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence - readBitPackedLongs(literals, numLiterals, length, fixedBits, input); - while (length > 0) { - if (deltaBase < 0) { - literals[numLiterals] = prevVal - literals[numLiterals]; - } - else { - literals[numLiterals] = prevVal + literals[numLiterals]; - } - prevVal = literals[numLiterals]; - length--; - numLiterals++; - } - } - } - - // This comes from the Apache Hive ORC code - private void readPatchedBaseValues(int firstByte) - throws IOException - { - // extract the number of fixed bits - int fb = LongDecode.decodeBitWidth((firstByte >>> 1) & 0x1F); // 0b1_1111 - - // extract the run length of data blob - int length = (firstByte & 0x01) << 8; - length |= input.read(); - // runs are always one off - length += 1; - - // extract the number of bytes occupied by base - int thirdByte = input.read(); - int baseWidth = (thirdByte >>> 5) & 0x07; // 0b0111 - // base width is one off - baseWidth += 1; - - // extract patch width - int patchWidth = LongDecode.decodeBitWidth(thirdByte & 0x1F); // 0b1_1111 - - // read fourth byte and extract patch gap width - int fourthByte = input.read(); - int patchGapWidth = (fourthByte >>> 5) & 0x07; // 0b0111 - // patch gap width is one off - patchGapWidth += 1; - - 
// extract the length of the patch list - int patchListLength = fourthByte & 0x1F; // 0b1_1111 - - // read the next base width number of bytes to extract base value - long base = bytesToLongBE(input, baseWidth); - long mask = (1L << ((baseWidth * 8) - 1)); - // if MSB of base value is 1 then base is negative value else positive - if ((base & mask) != 0) { - base = base & ~mask; - base = -base; - } - - // unpack the data blob - long[] unpacked = new long[length]; - readBitPackedLongs(unpacked, 0, length, fb, input); - - // unpack the patch blob - long[] unpackedPatch = new long[patchListLength]; - - verifyFormat((patchWidth + patchGapWidth) <= 64 || skipCorrupt, "ORC file is corrupt"); - - int bitSize = LongDecode.getClosestFixedBits(patchWidth + patchGapWidth); - readBitPackedLongs(unpackedPatch, 0, patchListLength, bitSize, input); - - // apply the patch directly when decoding the packed data - int patchIndex = 0; - long currentGap; - long currentPatch; - long patchMask = ((1L << patchWidth) - 1); - currentGap = unpackedPatch[patchIndex] >>> patchWidth; - currentPatch = unpackedPatch[patchIndex] & patchMask; - long actualGap = 0; - - // special case: gap is >255 then patch value will be 0. 
- // if gap is <=255 then patch value cannot be 0 - while (currentGap == 255 && currentPatch == 0) { - actualGap += 255; - patchIndex++; - currentGap = unpackedPatch[patchIndex] >>> patchWidth; - currentPatch = unpackedPatch[patchIndex] & patchMask; - } - // add the left over gap - actualGap += currentGap; - - // unpack data blob, patch it (if required), add base to get final result - for (int i = 0; i < unpacked.length; i++) { - if (i == actualGap) { - // extract the patch value - long patchedValue = unpacked[i] | (currentPatch << fb); - - // add base to patched value - literals[numLiterals++] = base + patchedValue; - - // increment the patch to point to next entry in patch list - patchIndex++; - - if (patchIndex < patchListLength) { - // read the next gap and patch - currentGap = unpackedPatch[patchIndex] >>> patchWidth; - currentPatch = unpackedPatch[patchIndex] & patchMask; - actualGap = 0; - - // special case: gap is >255 then patch will be 0. if gap is - // <=255 then patch cannot be 0 - while (currentGap == 255 && currentPatch == 0) { - actualGap += 255; - patchIndex++; - currentGap = unpackedPatch[patchIndex] >>> patchWidth; - currentPatch = unpackedPatch[patchIndex] & patchMask; - } - // add the left over gap - actualGap += currentGap; - - // next gap is relative to the current gap - actualGap += i; - } - } - else { - // no patching required. 
add base to unpacked value to get final value - literals[numLiterals++] = base + unpacked[i]; - } - } - - } - - // This comes from the Apache Hive ORC code - private void readDirectValues(int firstByte) - throws IOException - { - // extract the number of fixed bits - int fixedBits = LongDecode.decodeBitWidth((firstByte >>> 1) & 0x1F); // 0b1_1111 - - // extract the run length - int length = (firstByte & 0x01) << 8; - length |= input.read(); - // runs are one off - length += 1; - - // write the unpacked values and zigzag decode to result buffer - readBitPackedLongs(literals, numLiterals, length, fixedBits, input); - if (signed) { - for (int i = 0; i < length; i++) { - literals[numLiterals] = LongDecode.zigzagDecode(literals[numLiterals]); - numLiterals++; - } - } - else { - numLiterals += length; - } - } - - // This comes from the Apache Hive ORC code - private void readShortRepeatValues(int firstByte) - throws IOException - { - // read the number of bytes occupied by the value - int size = (firstByte >>> 3) & 0x07; // 0b0111 - // #bytes are one off - size += 1; - - // read the run length - int length = firstByte & 0x07; - // run lengths values are stored only after MIN_REPEAT value is met - length += MIN_REPEAT_SIZE; - - // read the repeated value which is store using fixed bytes - long val = bytesToLongBE(input, size); - - if (signed) { - val = LongDecode.zigzagDecode(val); - } - - // repeat the value for length times - for (int i = 0; i < length; i++) { - literals[numLiterals++] = val; - } - } - - // This comes from the Apache Hive ORC code - private static void readBitPackedLongs(long[] buffer, int offset, int len, int bitSize, InputStream input) - throws IOException - { - int bitsLeft = 0; - int current = 0; - - for (int i = offset; i < (offset + len); i++) { - long result = 0; - int bitsLeftToRead = bitSize; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= current & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - current = 
input.read(); - bitsLeft = 8; - } - - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= bitsLeftToRead; - result |= (current >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - buffer[i] = result; - } - } - - /** - * Read n bytes in big endian order and convert to long. - */ - private static long bytesToLongBE(InputStream input, int n) - throws IOException - { - long out = 0; - long val; - while (n > 0) { - n--; - // store it in a long and then shift else integer overflow will occur - val = input.read(); - out |= (val << (n * 8)); - } - return out; - } - - @Override - public long next() - throws IOException - { - if (used == numLiterals) { - numLiterals = 0; - used = 0; - readValues(); - } - return literals[used++]; - } - - @Override - public Class getCheckpointType() - { - return LongStreamV2Checkpoint.class; - } - - @Override - public void seekToCheckpoint(LongStreamCheckpoint checkpoint) - throws IOException - { - LongStreamV2Checkpoint v2Checkpoint = OrcStreamUtils.checkType(checkpoint, LongStreamV2Checkpoint.class, "Checkpoint"); - - // if the checkpoint is within the current buffer, just adjust the pointer - if (lastReadInputCheckpoint == v2Checkpoint.getInputStreamCheckpoint() && v2Checkpoint.getOffset() <= numLiterals) { - used = v2Checkpoint.getOffset(); - } - else { - // otherwise, discard the buffer and start over - input.seekToCheckpoint(v2Checkpoint.getInputStreamCheckpoint()); - numLiterals = 0; - used = 0; - skip(v2Checkpoint.getOffset()); - } - } - - @Override - public void skip(int items) - throws IOException - { - while (items > 0) { - if (used == numLiterals) { - numLiterals = 0; - used = 0; - readValues(); - } - long consume = Math.min(items, numLiterals - used); - used += consume; - items -= consume; - } - } - - @Override - public long sum(int items) - throws IOException - { - long sum = 0; - for (int i = 0; i < items; i++) { - sum += next(); - } - return sum; - } - - @Override - public void 
nextLongVector(int items, long[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = next(); - } - } - - @Override - public void nextLongVector(int items, long[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = next(); - } - } - } - - @Override - public void nextIntVector(int items, int[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = Ints.checkedCast(next()); - } - } - - @Override - public void nextIntVector(int items, int[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = Ints.checkedCast(next()); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/MissingStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/MissingStreamSource.java deleted file mode 100644 index 3c9adcefb8..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/MissingStreamSource.java +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import javax.annotation.Nullable; -import java.io.IOException; - -public class MissingStreamSource> implements StreamSource -{ - private final Class streamType; - - public static > StreamSource missingStreamSource(Class streamType) - { - return new MissingStreamSource(streamType); - } - - private MissingStreamSource(Class streamType) - { - this.streamType = streamType; - } - - @Override - public Class getStreamType() - { - return streamType; - } - - @Nullable - @Override - public S openStream() - throws IOException - { - return null; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java deleted file mode 100644 index 54472236d8..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import io.airlift.slice.BasicSliceInput; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; -import org.iq80.snappy.Snappy; - -import java.io.IOException; -import java.io.InputStream; -import java.util.zip.DataFormatException; -import java.util.zip.Inflater; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.*; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; -import static io.airlift.slice.Slices.EMPTY_SLICE; -import static sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET; - -public final class OrcInputStream - extends InputStream -{ - public static final int BLOCK_HEADER_SIZE = 3; - - private final String source; - private final BasicSliceInput compressedSliceInput; - private final CompressionKind compressionKind; - private final int bufferSize; - - private int currentCompressedBlockOffset; - private BasicSliceInput current; - - private Slice buffer; - - public OrcInputStream(String source, BasicSliceInput sliceInput, CompressionKind compressionKind, int bufferSize) - { - this.source = checkNotNull(source, "source is null"); - - checkNotNull(sliceInput, "sliceInput is null"); - - this.compressionKind = checkNotNull(compressionKind, "compressionKind is null"); - this.bufferSize = bufferSize; - - if (compressionKind == UNCOMPRESSED) { - this.current = sliceInput; - this.compressedSliceInput = EMPTY_SLICE.getInput(); - } - else { - checkArgument(compressionKind == SNAPPY || compressionKind == ZLIB, "%s compression not 
supported", compressionKind); - this.compressedSliceInput = checkNotNull(sliceInput, "compressedSliceInput is null"); - this.current = EMPTY_SLICE.getInput(); - } - } - - @Override - public void close() - throws IOException - { - current = null; - } - - @Override - public int available() - throws IOException - { - if (current == null) { - return 0; - } - return current.available(); - } - - @Override - public boolean markSupported() - { - return false; - } - - @Override - public int read() - throws IOException - { - if (current == null) { - return -1; - } - - int result = current.read(); - if (result != -1) { - return result; - } - - advance(); - return read(); - } - - @Override - public int read(byte[] b, int off, int length) - throws IOException - { - if (current == null) { - return -1; - } - - if (!current.isReadable()) { - advance(); - if (current == null) { - return -1; - } - } - - return current.read(b, off, length); - } - - public long getCheckpoint() - { - // if the decompressed buffer is empty, return a checkpoint starting at the next block - if (current == null || (current.position() == 0 && current.available() == 0)) { - return createInputStreamCheckpoint(compressedSliceInput.position(), 0); - } - // otherwise return a checkpoint at the last compressed block read and the current position in the buffer - return createInputStreamCheckpoint(currentCompressedBlockOffset, current.position()); - } - - public boolean seekToCheckpoint(long checkpoint) - throws IOException - { - int compressedBlockOffset = decodeCompressedBlockOffset(checkpoint); - int decompressedOffset = decodeDecompressedOffset(checkpoint); - boolean discardedBuffer; - if (compressedBlockOffset != currentCompressedBlockOffset) { - verifyFormat(compressionKind != UNCOMPRESSED, "Reset stream has a compressed block offset but stream is not compressed"); - compressedSliceInput.setPosition(compressedBlockOffset); - current = EMPTY_SLICE.getInput(); - discardedBuffer = true; - } - else { - 
discardedBuffer = false; - } - - if (decompressedOffset != current.position()) { - current.setPosition(0); - if (current.available() < decompressedOffset) { - decompressedOffset -= current.available(); - advance(); - } - current.setPosition(decompressedOffset); - } - return discardedBuffer; - } - - @Override - public long skip(long n) - throws IOException - { - if (current == null || n <= 0) { - return -1; - } - - long result = current.skip(n); - if (result != 0) { - return result; - } - if (read() == -1) { - return 0; - } - return 1 + current.skip(n - 1); - } - - // This comes from the Apache Hive ORC code - private void advance() - throws IOException - { - if (compressedSliceInput == null || compressedSliceInput.available() == 0) { - current = null; - return; - } - - // 3 byte header - // NOTE: this must match BLOCK_HEADER_SIZE - currentCompressedBlockOffset = compressedSliceInput.position(); - int b0 = compressedSliceInput.readUnsignedByte(); - int b1 = compressedSliceInput.readUnsignedByte(); - int b2 = compressedSliceInput.readUnsignedByte(); - - boolean isUncompressed = (b0 & 0x01) == 1; - int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >>> 1); - - Slice chunk = compressedSliceInput.readSlice(chunkLength); - - if (isUncompressed) { - current = chunk.getInput(); - } - else { - if (buffer == null) { - buffer = Slices.allocate(bufferSize); - } - - int uncompressedSize; - if (compressionKind == ZLIB) { - uncompressedSize = decompressZip(chunk, buffer); - } - else { - uncompressedSize = decompressSnappy(chunk, buffer); - } - - current = buffer.slice(0, uncompressedSize).getInput(); - } - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("source", source) - .add("compressedOffset", compressedSliceInput.position()) - .add("uncompressedOffset", current == null ? 
null : current.position()) - .add("compression", compressionKind) - .toString(); - } - - // This comes from the Apache Hive ORC code - private static int decompressZip(Slice in, Slice buffer) - throws IOException - { - byte[] outArray = (byte[]) buffer.getBase(); - int outOffset = 0; - - byte[] inArray = (byte[]) in.getBase(); - int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET); - int inLength = in.length(); - - Inflater inflater = new Inflater(true); - inflater.setInput(inArray, inOffset, inLength); - while (!(inflater.finished() || inflater.needsDictionary() || inflater.needsInput())) { - try { - int count = inflater.inflate(outArray, outOffset, outArray.length - outOffset); - outOffset += count; - } - catch (DataFormatException e) { - throw new OrcCorruptionException(e, "Invalid compressed stream"); - } - } - inflater.end(); - return outOffset; - } - - private static int decompressSnappy(Slice in, Slice buffer) - throws IOException - { - byte[] outArray = (byte[]) buffer.getBase(); - - byte[] inArray = (byte[]) in.getBase(); - int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET); - int inLength = in.length(); - - return Snappy.uncompress(inArray, inOffset, inLength, outArray, 0); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java deleted file mode 100644 index 2f04155d6c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import java.io.IOException; -import java.io.InputStream; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; - -final class OrcStreamUtils -{ - public static final int MIN_REPEAT_SIZE = 3; - - private OrcStreamUtils() - { - } - - public static void skipFully(InputStream input, long length) - throws IOException - { - while (length > 0) { - long result = input.skip(length); - verifyFormat(result >= 0, "Unexpected end of stream"); - length -= result; - } - } - - public static void readFully(InputStream input, byte[] buffer, int offset, int length) - throws IOException - { - while (offset < length) { - int result = input.read(buffer, offset, length - offset); - verifyFormat(result >= 0, "Unexpected end of stream"); - offset += result; - } - } - - static B checkType(A value, Class target, String name) - { - checkNotNull(value, "%s is null", name); - checkArgument(target.isInstance(value), - "%s must be of type %s, not %s", - name, - target.getName(), - value.getClass().getName()); - return target.cast(value); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/RowGroupDictionaryLengthStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/RowGroupDictionaryLengthStream.java deleted file mode 100644 index 
5cfc097aa8..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/RowGroupDictionaryLengthStream.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.RowGroupDictionaryLengthStreamCheckpoint; - -import java.io.IOException; - -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.checkType; - -public class RowGroupDictionaryLengthStream - extends LongStreamV1 -{ - private int entryCount = -1; - - public RowGroupDictionaryLengthStream(OrcInputStream input, boolean signed) - { - super(input, signed); - } - - public int getEntryCount() - { - return entryCount; - } - - @Override - public Class getCheckpointType() - { - return RowGroupDictionaryLengthStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(LongStreamCheckpoint checkpoint) - throws IOException - { - super.seekToCheckpoint(checkpoint); - RowGroupDictionaryLengthStreamCheckpoint rowGroupDictionaryLengthStreamCheckpoint = checkType(checkpoint, RowGroupDictionaryLengthStreamCheckpoint.class, "Checkpoint"); - entryCount = rowGroupDictionaryLengthStreamCheckpoint.getRowGroupDictionarySize(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSource.java deleted file mode 100644 index 4aba1e3000..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSource.java +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import javax.annotation.Nullable; -import java.io.IOException; - -public interface StreamSource> -{ - Class getStreamType(); - - @Nullable - S openStream() - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java deleted file mode 100644 index e03dbbbae1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.collect.ImmutableMap; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.StreamId; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; - -import javax.annotation.Nonnull; -import java.util.Map; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class StreamSources -{ - private final Map> streamSources; - - public StreamSources(Map> streamSources) - { - this.streamSources = ImmutableMap.copyOf(checkNotNull(streamSources, "streamSources is null")); - } - - @Nonnull - public > StreamSource getStreamSource(StreamDescriptor streamDescriptor, StreamKind streamKind, Class streamType) - { - checkNotNull(streamDescriptor, "streamDescriptor is null"); - checkNotNull(streamType, "streamType is null"); - - StreamSource streamSource = streamSources.get(new StreamId(streamDescriptor.getStreamId(), streamKind)); - if (streamSource == null) { - streamSource = missingStreamSource(streamType); - } - - checkArgument(streamType.isAssignableFrom(streamSource.getStreamType()), - "%s must be of type %s, not %s", - streamDescriptor, - streamType.getName(), - streamSource.getStreamType().getName()); - - return (StreamSource) streamSource; - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStream.java deleted file mode 100644 index dd15397187..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStream.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; - -import java.io.IOException; - -public interface ValueStream -{ - Class getCheckpointType(); - - void seekToCheckpoint(C checkpoint) - throws IOException; - - void skip(int items) - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreams.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreams.java deleted file mode 100644 index 4953473369..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ValueStreams.java +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.StreamId; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; - -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; -import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.INT; -import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.TIMESTAMP; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; - -public final class ValueStreams -{ - private ValueStreams() - { - } - - public static ValueStream createValueStreams( - StreamId streamId, - OrcInputStream inputStream, - OrcTypeKind type, - ColumnEncodingKind encoding, - boolean usesVInt) - { - if (streamId.getStreamKind() == PRESENT) { - return new BooleanStream(inputStream); - } - - // dictionary length and data streams are unsigned int streams - if ((encoding == DICTIONARY || encoding == DICTIONARY_V2) && (streamId.getStreamKind() == LENGTH || streamId.getStreamKind() == DATA)) { - return createLongStream(inputStream, encoding, INT, false, usesVInt); - } - - if (streamId.getStreamKind() == DATA) { - switch (type) { - case BOOLEAN: - return new BooleanStream(inputStream); - case BYTE: - return new ByteStream(inputStream); - case SHORT: - case INT: - case LONG: - case DATE: - return createLongStream(inputStream, encoding, type, true, usesVInt); - 
case FLOAT: - return new FloatStream(inputStream); - case DOUBLE: - return new DoubleStream(inputStream); - case STRING: - case BINARY: - return new ByteArrayStream(inputStream); - case TIMESTAMP: - return createLongStream(inputStream, encoding, type, true, usesVInt); - } - } - - // length stream of a direct encoded string or binary column - if (streamId.getStreamKind() == LENGTH) { - switch (type) { - case STRING: - case BINARY: - case MAP: - case LIST: - return createLongStream(inputStream, encoding, type, false, usesVInt); - } - } - - // length stream of a the row group dictionary - if (streamId.getStreamKind() == ROW_GROUP_DICTIONARY_LENGTH) { - switch (type) { - case STRING: - case BINARY: - return new RowGroupDictionaryLengthStream(inputStream, false); - } - } - - // row group dictionary - if (streamId.getStreamKind() == ROW_GROUP_DICTIONARY) { - switch (type) { - case STRING: - case BINARY: - return new ByteArrayStream(inputStream); - } - } - - // row group dictionary - if (streamId.getStreamKind() == IN_DICTIONARY) { - return new BooleanStream(inputStream); - } - - // length (nanos) of a timestamp column - if (type == TIMESTAMP && streamId.getStreamKind() == SECONDARY) { - return createLongStream(inputStream, encoding, type, false, usesVInt); - } - - if (streamId.getStreamKind() == DICTIONARY_DATA) { - switch (type) { - case SHORT: - case INT: - case LONG: - return createLongStream(inputStream, DWRF_DIRECT, INT, true, usesVInt); - case STRING: - case VARCHAR: - case CHAR: - case BINARY: - return new ByteArrayStream(inputStream); - } - } - - throw new IllegalArgumentException(String.format("Unsupported column type %s for stream %s with encoding %s", type, streamId, encoding)); - } - - private static ValueStream createLongStream( - OrcInputStream inputStream, - ColumnEncodingKind encoding, - OrcTypeKind type, - boolean signed, - boolean usesVInt) - { - if (encoding == DIRECT_V2 || encoding == DICTIONARY_V2) { - return new LongStreamV2(inputStream, signed, 
false); - } - else if (encoding == DIRECT || encoding == DICTIONARY) { - return new LongStreamV1(inputStream, signed); - } - else if (encoding == DWRF_DIRECT) { - return new LongStreamDwrf(inputStream, type, signed, usesVInt); - } - else { - throw new IllegalArgumentException("Unsupported encoding for long stream: " + encoding); - } - } -} From c97e200bf1abf7cbb9855a19fc1cfe340316c58b Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Mon, 25 May 2015 15:51:33 -0700 Subject: [PATCH 060/141] Recover the version to maven artifact. --- pom.xml | 2 +- tajo-algebra/pom.xml | 2 +- tajo-catalog/pom.xml | 2 +- tajo-catalog/tajo-catalog-client/pom.xml | 2 +- tajo-catalog/tajo-catalog-common/pom.xml | 2 +- tajo-catalog/tajo-catalog-drivers/pom.xml | 2 +- tajo-catalog/tajo-catalog-server/pom.xml | 2 +- tajo-cli/pom.xml | 2 +- tajo-client/pom.xml | 2 +- tajo-common/pom.xml | 2 +- tajo-core/pom.xml | 2 +- tajo-dist/pom.xml | 2 +- tajo-jdbc/pom.xml | 2 +- tajo-maven-plugins/pom.xml | 2 +- tajo-plan/pom.xml | 2 +- tajo-project/pom.xml | 4 ++-- tajo-pullserver/pom.xml | 2 +- tajo-rpc/pom.xml | 2 +- tajo-storage/pom.xml | 2 +- tajo-storage/tajo-storage-common/pom.xml | 2 +- tajo-storage/tajo-storage-hbase/pom.xml | 2 +- tajo-storage/tajo-storage-hdfs/pom.xml | 2 +- tajo-thirdparty/asm/pom.xml | 2 +- 23 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pom.xml b/pom.xml index 1f0c5ff5e0..8ad9d3bd59 100644 --- a/pom.xml +++ b/pom.xml @@ -26,7 +26,7 @@ org.apache.tajo tajo-main - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT Tajo Main Tajo Main pom diff --git a/tajo-algebra/pom.xml b/tajo-algebra/pom.xml index 95151eb06d..66a86436fb 100644 --- a/tajo-algebra/pom.xml +++ b/tajo-algebra/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-algebra diff --git a/tajo-catalog/pom.xml b/tajo-catalog/pom.xml index c722368699..3745785cc8 100644 --- a/tajo-catalog/pom.xml +++ b/tajo-catalog/pom.xml @@ -21,7 +21,7 @@ tajo-project 
org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-client/pom.xml b/tajo-catalog/tajo-catalog-client/pom.xml index 3be6587470..46db4f2acc 100644 --- a/tajo-catalog/tajo-catalog-client/pom.xml +++ b/tajo-catalog/tajo-catalog-client/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-common/pom.xml b/tajo-catalog/tajo-catalog-common/pom.xml index 7af058151b..2622b895ca 100644 --- a/tajo-catalog/tajo-catalog-common/pom.xml +++ b/tajo-catalog/tajo-catalog-common/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-drivers/pom.xml b/tajo-catalog/tajo-catalog-drivers/pom.xml index 00a7ebd108..221a2b4404 100644 --- a/tajo-catalog/tajo-catalog-drivers/pom.xml +++ b/tajo-catalog/tajo-catalog-drivers/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-catalog/tajo-catalog-server/pom.xml b/tajo-catalog/tajo-catalog-server/pom.xml index 27c58a54b5..2c5a4bd7cb 100644 --- a/tajo-catalog/tajo-catalog-server/pom.xml +++ b/tajo-catalog/tajo-catalog-server/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-cli/pom.xml b/tajo-cli/pom.xml index b955681bde..ae45673c0a 100644 --- a/tajo-cli/pom.xml +++ b/tajo-cli/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-cli diff --git a/tajo-client/pom.xml b/tajo-client/pom.xml index 08f75fee0d..a1ffc97f7f 100644 --- a/tajo-client/pom.xml +++ b/tajo-client/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-client diff --git a/tajo-common/pom.xml b/tajo-common/pom.xml index 475e47f87e..58b4b589d5 
100644 --- a/tajo-common/pom.xml +++ b/tajo-common/pom.xml @@ -17,7 +17,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project diff --git a/tajo-core/pom.xml b/tajo-core/pom.xml index 1d4dc583b5..bc11d1c0c9 100644 --- a/tajo-core/pom.xml +++ b/tajo-core/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-core diff --git a/tajo-dist/pom.xml b/tajo-dist/pom.xml index b465c6f991..bf7c8146ee 100644 --- a/tajo-dist/pom.xml +++ b/tajo-dist/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-jdbc/pom.xml b/tajo-jdbc/pom.xml index b9e6930e8c..0db314a0c4 100644 --- a/tajo-jdbc/pom.xml +++ b/tajo-jdbc/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-jdbc diff --git a/tajo-maven-plugins/pom.xml b/tajo-maven-plugins/pom.xml index 543e0f05eb..fe0dc9e7c1 100644 --- a/tajo-maven-plugins/pom.xml +++ b/tajo-maven-plugins/pom.xml @@ -17,7 +17,7 @@ org.apache.tajo tajo-project - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project org.apache.tajo diff --git a/tajo-plan/pom.xml b/tajo-plan/pom.xml index cf5c4171b1..d72d8953af 100644 --- a/tajo-plan/pom.xml +++ b/tajo-plan/pom.xml @@ -22,7 +22,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project tajo-plan diff --git a/tajo-project/pom.xml b/tajo-project/pom.xml index d0b2a9c407..d08cb9d27f 100644 --- a/tajo-project/pom.xml +++ b/tajo-project/pom.xml @@ -22,7 +22,7 @@ org.apache.tajo tajo-main - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT org.apache.tajo tajo-project @@ -35,7 +35,7 @@ UTF-8 2.6.0 2.5.0 - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT 0.98.7-hadoop2 4.0.25.Final ${project.parent.relativePath}/.. 
diff --git a/tajo-pullserver/pom.xml b/tajo-pullserver/pom.xml index 516b16495a..71aa5fc0ac 100644 --- a/tajo-pullserver/pom.xml +++ b/tajo-pullserver/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-rpc/pom.xml b/tajo-rpc/pom.xml index fc57c0dff2..7061722e02 100644 --- a/tajo-rpc/pom.xml +++ b/tajo-rpc/pom.xml @@ -20,7 +20,7 @@ 4.0.0 tajo-project - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT org.apache.tajo ../tajo-project diff --git a/tajo-storage/pom.xml b/tajo-storage/pom.xml index faa1aaae10..58a010fa7a 100644 --- a/tajo-storage/pom.xml +++ b/tajo-storage/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-common/pom.xml b/tajo-storage/tajo-storage-common/pom.xml index 60b886a04c..570180a841 100644 --- a/tajo-storage/tajo-storage-common/pom.xml +++ b/tajo-storage/tajo-storage-common/pom.xml @@ -21,7 +21,7 @@ limitations under the License. 
tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-hbase/pom.xml b/tajo-storage/tajo-storage-hbase/pom.xml index 79a8b99b89..3a7d5d93fc 100644 --- a/tajo-storage/tajo-storage-hbase/pom.xml +++ b/tajo-storage/tajo-storage-hbase/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index b89bf4d6b0..05afba362d 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -21,7 +21,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project 4.0.0 diff --git a/tajo-thirdparty/asm/pom.xml b/tajo-thirdparty/asm/pom.xml index 1483611554..a085866adb 100644 --- a/tajo-thirdparty/asm/pom.xml +++ b/tajo-thirdparty/asm/pom.xml @@ -17,7 +17,7 @@ tajo-project org.apache.tajo - 0.10.2-SNAPSHOT + 0.10.1-SNAPSHOT ../../tajo-project From bba9fa5027df5b8f6e8c607a13017d89b44eeeb4 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 26 May 2015 14:01:52 +0900 Subject: [PATCH 061/141] Remove useless maven dependancies --- tajo-storage/tajo-storage-hdfs/pom.xml | 63 -------------------------- 1 file changed, 63 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index d6e1cb728b..7db8a0f353 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -354,74 +354,11 @@ net.minidev json-smart - - io.airlift - slice - 0.7 - - - io.airlift - units - 0.97 - - - com.google.guava - guava - 18.0 - - - joda-time - joda-time - 2.4 - - - org.iq80.snappy - snappy - 0.2 - - - com.facebook.presto.hive - hive-apache - 0.9 - - - org.jetbrains - annotations - 13.0 - - - com.fasterxml.jackson.core - jackson-core - 2.4.2 - com.facebook.presto presto-orc 0.86 - - com.facebook.hive - hive-dwrf - 0.8 - - - 
commons-logging - commons-logging - - - org.iq80.snappy - snappy - - - com.facebook.presto.hadoop - hadoop-cdh4 - - - it.unimi.dsi - fastutil - - - From 985f61d6b1c71a7b261a5f91496fde104b61d708 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Tue, 26 May 2015 16:13:28 +0900 Subject: [PATCH 062/141] TAJO-1619: JDBC program is stuck after closing. (jihoon) --- CHANGES | 2 ++ .../org/apache/tajo/client/SessionConnection.java | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/CHANGES b/CHANGES index 50b735c296..5baaa53442 100644 --- a/CHANGES +++ b/CHANGES @@ -40,6 +40,8 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1619: JDBC program is stuck after closing. (jihoon) + TAJO-1612: TestKillQuery occassionally fails. (hyunsik) TAJO-1440: Some tests fail in parallel test environment in TestKillQuery. diff --git a/tajo-client/src/main/java/org/apache/tajo/client/SessionConnection.java b/tajo-client/src/main/java/org/apache/tajo/client/SessionConnection.java index b0cc662225..187af339b8 100644 --- a/tajo-client/src/main/java/org/apache/tajo/client/SessionConnection.java +++ b/tajo-client/src/main/java/org/apache/tajo/client/SessionConnection.java @@ -30,9 +30,11 @@ import org.apache.tajo.ipc.ClientProtos.SessionUpdateResponse; import org.apache.tajo.ipc.TajoMasterClientProtocol; import org.apache.tajo.rpc.NettyClientBase; +import org.apache.tajo.rpc.RpcChannelFactory; import org.apache.tajo.rpc.RpcClientManager; import org.apache.tajo.rpc.ServerCallable; import org.apache.tajo.service.ServiceTracker; +import org.apache.tajo.util.CommonTestingUtil; import org.apache.tajo.util.KeyValueSet; import org.apache.tajo.util.ProtoUtil; @@ -46,6 +48,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicInteger; import static org.apache.tajo.ipc.ClientProtos.CreateSessionRequest; import static org.apache.tajo.ipc.ClientProtos.CreateSessionResponse; @@ -55,6 +58,8 @@ public class 
SessionConnection implements Closeable { private final Log LOG = LogFactory.getLog(TajoClientImpl.class); + private final static AtomicInteger connections = new AtomicInteger(); + final RpcClientManager manager; private final String baseDatabase; @@ -91,6 +96,7 @@ public SessionConnection(ServiceTracker tracker, @Nullable String baseDatabase, this.baseDatabase = baseDatabase != null ? baseDatabase : null; this.serviceTracker = tracker; + connections.incrementAndGet(); } public Map getClientSideSessionVars() { @@ -287,6 +293,14 @@ public void close() { // ignore } finally { RpcClientManager.cleanup(client); + if(connections.decrementAndGet() == 0) { + if (!System.getProperty(CommonTestingUtil.TAJO_TEST_KEY, "FALSE").equals(CommonTestingUtil.TAJO_TEST_TRUE)) { + RpcChannelFactory.shutdownGracefully(); + if (LOG.isDebugEnabled()) { + LOG.debug("RPC connection is closed"); + } + } + } } } From 808d0e782a33ca2b2488facf0890418d6e2fac05 Mon Sep 17 00:00:00 2001 From: Jinho Kim Date: Wed, 27 May 2015 10:51:48 +0900 Subject: [PATCH 063/141] TAJO-1621: Compilation error with hadoop 2.7.0. (jinho) --- CHANGES | 2 + tajo-core/pom.xml | 1 + .../apache/tajo/engine/utils/ThreadUtil.java | 149 ------------------ .../apache/tajo/ha/HdfsServiceTracker.java | 13 +- .../org/apache/tajo/TajoTestingCluster.java | 1 + 5 files changed, 11 insertions(+), 155 deletions(-) delete mode 100644 tajo-core/src/main/java/org/apache/tajo/engine/utils/ThreadUtil.java diff --git a/CHANGES b/CHANGES index 5baaa53442..4603f3ed62 100644 --- a/CHANGES +++ b/CHANGES @@ -40,6 +40,8 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1621: Compilation error with hadoop 2.7.0. (jinho) + TAJO-1619: JDBC program is stuck after closing. (jihoon) TAJO-1612: TestKillQuery occassionally fails. 
(hyunsik) diff --git a/tajo-core/pom.xml b/tajo-core/pom.xml index bc11d1c0c9..15c3e86af6 100644 --- a/tajo-core/pom.xml +++ b/tajo-core/pom.xml @@ -733,6 +733,7 @@ ${maven.fork.count} true + false -Xms512m -Xmx1024m -XX:MaxPermSize=128m -Dfile.encoding=UTF-8 true true diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/utils/ThreadUtil.java b/tajo-core/src/main/java/org/apache/tajo/engine/utils/ThreadUtil.java deleted file mode 100644 index 23b1e5d24d..0000000000 --- a/tajo-core/src/main/java/org/apache/tajo/engine/utils/ThreadUtil.java +++ /dev/null @@ -1,149 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.engine.utils; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.util.ReflectionUtils; - -import java.io.PrintWriter; -import java.lang.Thread.UncaughtExceptionHandler; - -public class ThreadUtil { - protected static final Log LOG = LogFactory.getLog(ThreadUtil.class); - - /** - * Utility method that sets name, daemon status and starts passed thread. - * @param t thread to run - * @return Returns the passed Thread t. 
- */ - public static Thread setDaemonThreadRunning(final Thread t) { - return setDaemonThreadRunning(t, t.getName()); - } - - /** - * Utility method that sets name, daemon status and starts passed thread. - * @param t thread to frob - * @param name new name - * @return Returns the passed Thread t. - */ - public static Thread setDaemonThreadRunning(final Thread t, - final String name) { - return setDaemonThreadRunning(t, name, null); - } - - /** - * Utility method that sets name, daemon status and starts passed thread. - * @param t thread to frob - * @param name new name - * @param handler A handler to set on the thread. Pass null if want to - * use default handler. - * @return Returns the passed Thread t. - */ - public static Thread setDaemonThreadRunning(final Thread t, - final String name, final UncaughtExceptionHandler handler) { - t.setName(name); - if (handler != null) { - t.setUncaughtExceptionHandler(handler); - } - t.setDaemon(true); - t.start(); - return t; - } - - /** - * Shutdown passed thread using isAlive and join. - * @param t Thread to shutdown - */ - public static void shutdown(final Thread t) { - shutdown(t, 0); - } - - /** - * Shutdown passed thread using isAlive and join. - * @param joinwait Pass 0 if we're to wait forever. - * @param t Thread to shutdown - */ - public static void shutdown(final Thread t, final long joinwait) { - if (t == null) return; - while (t.isAlive()) { - try { - t.join(joinwait); - } catch (InterruptedException e) { - LOG.warn(t.getName() + "; joinwait=" + joinwait, e); - } - } - } - - - /** - * @param t Waits on the passed thread to die dumping a threaddump every - * minute while its up. 
- * @throws InterruptedException - */ - public static void threadDumpingIsAlive(final Thread t) - throws InterruptedException { - if (t == null) { - return; - } - - while (t.isAlive()) { - t.join(60 * 1000); - if (t.isAlive()) { - ReflectionUtils.printThreadInfo(new PrintWriter(System.out), - "Automatic Stack Trace every 60 seconds waiting on " + - t.getName()); - } - } - } - - /** - * @param millis How long to sleep for in milliseconds. - */ - public static void sleep(int millis) { - try { - Thread.sleep(millis); - } catch (InterruptedException e) { - e.printStackTrace(); - } - } - - /** - * Sleeps for the given amount of time even if interrupted. Preserves - * the interrupt status. - * @param msToWait the amount of time to sleep in milliseconds - */ - public static void sleepWithoutInterrupt(final long msToWait) { - long timeMillis = System.currentTimeMillis(); - long endTime = timeMillis + msToWait; - boolean interrupted = false; - while (timeMillis < endTime) { - try { - Thread.sleep(endTime - timeMillis); - } catch (InterruptedException ex) { - interrupted = true; - } - timeMillis = System.currentTimeMillis(); - } - - if (interrupted) { - Thread.currentThread().interrupt(); - } - } -} diff --git a/tajo-core/src/main/java/org/apache/tajo/ha/HdfsServiceTracker.java b/tajo-core/src/main/java/org/apache/tajo/ha/HdfsServiceTracker.java index 5f1aff8b9d..d0eb9852f8 100644 --- a/tajo-core/src/main/java/org/apache/tajo/ha/HdfsServiceTracker.java +++ b/tajo-core/src/main/java/org/apache/tajo/ha/HdfsServiceTracker.java @@ -350,6 +350,13 @@ private class PingChecker implements Runnable { @Override public void run() { while (!stopped && !Thread.currentThread().isInterrupted()) { + try { + Thread.sleep(monitorInterval); + } catch (InterruptedException e) { + LOG.info("PingChecker interrupted. 
- masterName:" + masterName); + break; + } + synchronized (HdfsServiceTracker.this) { try { if (!currentActiveMaster.equals(masterName)) { @@ -371,12 +378,6 @@ public void run() { e.printStackTrace(); } } - try { - Thread.sleep(monitorInterval); - } catch (InterruptedException e) { - LOG.info("PingChecker interrupted. - masterName:" + masterName); - break; - } } } } diff --git a/tajo-core/src/test/java/org/apache/tajo/TajoTestingCluster.java b/tajo-core/src/test/java/org/apache/tajo/TajoTestingCluster.java index 17348e1b25..79a5944309 100644 --- a/tajo-core/src/test/java/org/apache/tajo/TajoTestingCluster.java +++ b/tajo-core/src/test/java/org/apache/tajo/TajoTestingCluster.java @@ -510,6 +510,7 @@ public void startMiniCluster(final int numSlaves, final String [] dataNodeHosts) startMiniDFSCluster(numDataNodes, clusterTestBuildDir, dataNodeHosts); this.dfsCluster.waitClusterUp(); + conf.setInt("hbase.hconnection.threads.core", 50); hbaseUtil = new HBaseTestClusterUtil(conf, clusterTestBuildDir); if(!standbyWorkerMode) { From d8bdfd761db3402ba78ecfa2d7887a031baabd6e Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Tue, 26 May 2015 21:54:07 -0700 Subject: [PATCH 064/141] TAJO-1623: INSERT INTO with wrong target columns causes NPE. Closes #587 --- CHANGES | 2 ++ .../java/org/apache/tajo/QueryTestCaseBase.java | 16 ++++++++++++---- .../engine/planner/TestQueryValidation.java | 17 ++++++++++++----- .../org/apache/tajo/plan/LogicalPlanner.java | 6 ++++++ 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/CHANGES b/CHANGES index 4603f3ed62..b4f4c0c48a 100644 --- a/CHANGES +++ b/CHANGES @@ -40,6 +40,8 @@ Release 0.10.1 - unreleased BUG FIXES + TAJO-1623: INSERT INTO with wrong target columns causes NPE. (hyunsik) + TAJO-1621: Compilation error with hadoop 2.7.0. (jinho) TAJO-1619: JDBC program is stuck after closing. 
(jihoon) diff --git a/tajo-core/src/test/java/org/apache/tajo/QueryTestCaseBase.java b/tajo-core/src/test/java/org/apache/tajo/QueryTestCaseBase.java index ddfa7a67fb..92d1bcd7a8 100644 --- a/tajo-core/src/test/java/org/apache/tajo/QueryTestCaseBase.java +++ b/tajo-core/src/test/java/org/apache/tajo/QueryTestCaseBase.java @@ -276,24 +276,32 @@ private static VerificationState verify(String query) throws PlanningException { return state; } - public void assertValidSQL(String fileName) throws PlanningException, IOException { - Path queryFilePath = getQueryFilePath(fileName); - String query = FileUtil.readTextFile(new File(queryFilePath.toUri())); + public void assertValidSQL(String query) throws PlanningException, IOException { VerificationState state = verify(query); if (state.getErrorMessages().size() > 0) { fail(state.getErrorMessages().get(0)); } } - public void assertInvalidSQL(String fileName) throws PlanningException, IOException { + public void assertValidSQLFromFile(String fileName) throws PlanningException, IOException { Path queryFilePath = getQueryFilePath(fileName); String query = FileUtil.readTextFile(new File(queryFilePath.toUri())); + assertValidSQL(query); + } + + public void assertInvalidSQL(String query) throws PlanningException, IOException { VerificationState state = verify(query); if (state.getErrorMessages().size() == 0) { fail(PreLogicalPlanVerifier.class.getSimpleName() + " cannot catch any verification error: " + query); } } + public void assertInvalidSQLFromFile(String fileName) throws PlanningException, IOException { + Path queryFilePath = getQueryFilePath(fileName); + String query = FileUtil.readTextFile(new File(queryFilePath.toUri())); + assertInvalidSQL(query); + } + public void assertPlanError(String fileName) throws PlanningException, IOException { Path queryFilePath = getQueryFilePath(fileName); String query = FileUtil.readTextFile(new File(queryFilePath.toUri())); diff --git 
a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestQueryValidation.java b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestQueryValidation.java index b6827a2e4b..fd60a5e53f 100644 --- a/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestQueryValidation.java +++ b/tajo-core/src/test/java/org/apache/tajo/engine/planner/TestQueryValidation.java @@ -25,19 +25,26 @@ import java.io.IOException; public class TestQueryValidation extends QueryTestCaseBase { + + @Test + public void testInsertWithWrongTargetColumn() throws Exception { + executeString("CREATE TABLE T1 (col1 int, col2 int)").close(); + assertInvalidSQL("INSERT INTO T1 (col1, col3) select l_orderkey, l_partkey from default.lineitem"); + } + @Test public void testLimitClauses() throws PlanningException, IOException { // select * from lineitem limit 3; - assertValidSQL("valid_limit_1.sql"); + assertValidSQLFromFile("valid_limit_1.sql"); // select * from lineitem limit l_orderkey; - assertInvalidSQL("invalid_limit_1.sql"); + assertInvalidSQLFromFile("invalid_limit_1.sql"); } @Test public void testGroupByClauses() throws PlanningException, IOException { // select l_orderkey from lineitem group by l_orderkey; - assertValidSQL("valid_groupby_1.sql"); + assertValidSQLFromFile("valid_groupby_1.sql"); // select * from lineitem group by l_orderkey; assertPlanError("error_groupby_1.sql"); @@ -48,12 +55,12 @@ public void testGroupByClauses() throws PlanningException, IOException { @Test public void testCaseWhenExprs() throws PlanningException, IOException { // See TAJO-1098 - assertInvalidSQL("invalid_casewhen_1.sql"); + assertInvalidSQLFromFile("invalid_casewhen_1.sql"); } @Test public void testUnsupportedStoreType() throws PlanningException, IOException { // See TAJO-1249 - assertInvalidSQL("invalid_store_format.sql"); + assertInvalidSQLFromFile("invalid_store_format.sql"); } } diff --git a/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java 
b/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java index 27cd3b0bd1..435b3e3539 100644 --- a/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java +++ b/tajo-plan/src/main/java/org/apache/tajo/plan/LogicalPlanner.java @@ -1534,6 +1534,12 @@ private InsertNode buildInsertIntoTablePlan(PlanContext context, InsertNode inse Schema targetColumns = new Schema(); for (int i = 0; i < targets.length; i++) { Column targetColumn = desc.getLogicalSchema().getColumn(targets[i]); + + if (targetColumn == null) { + throw new PlanningException("column \"" + targets[i] + "\" of relation \"" + + desc.getName() + "\" does not exist"); + } + targetColumns.addColumn(targetColumn); } insertNode.setTargetSchema(targetColumns); From c94edd86a17cf9e26ffb58787a8f7dcf13cfbf43 Mon Sep 17 00:00:00 2001 From: Hyunsik Choi Date: Thu, 28 May 2015 05:06:32 -0700 Subject: [PATCH 065/141] TAJO-1626: JdbcConnection::setAutoCommit() should not throw an exception. --- CHANGES | 3 +++ .../src/main/java/org/apache/tajo/jdbc/JdbcConnection.java | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index b4f4c0c48a..121e47fb37 100644 --- a/CHANGES +++ b/CHANGES @@ -9,6 +9,9 @@ Release 0.10.1 - unreleased IMPROVEMENT + TAJO-1626: JdbcConnection::setAutoCommit() should not throw an exception. 
+ (hyunsik) + TAJO-1452: Improve function listing order (Contributed Dongjoon Hyun, Committed by hyunsik) diff --git a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/JdbcConnection.java b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/JdbcConnection.java index 287954083c..85b6af3006 100644 --- a/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/JdbcConnection.java +++ b/tajo-jdbc/src/main/java/org/apache/tajo/jdbc/JdbcConnection.java @@ -358,7 +358,7 @@ public void rollback(Savepoint savepoint) throws SQLException { @Override public void setAutoCommit(boolean autoCommit) throws SQLException { - throw new SQLFeatureNotSupportedException("setAutoCommit"); + LOG.warn("Tajo does not support setAutoCommit, so this invocation is ignored."); } @Override From e58084487d95b13d3a507b09a19eed93dee9fd35 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Mon, 18 May 2015 20:34:45 +0900 Subject: [PATCH 066/141] Sources based JDK 1.7 are applied from Presto --- tajo-storage/tajo-storage-hdfs/pom.xml | 63 +++ .../storage/thirdparty/orc/BooleanVector.java | 36 ++ .../storage/thirdparty/orc/DiskRange.java | 77 +++ .../storage/thirdparty/orc/DoubleVector.java | 36 ++ .../thirdparty/orc/FileOrcDataSource.java | 125 +++++ .../thirdparty/orc/HdfsOrcDataSource.java | 125 +++++ .../storage/thirdparty/orc/LongVector.java | 36 ++ .../storage/thirdparty/orc/ObjectVector.java | 29 ++ .../orc/OrcCorruptionException.java | 43 ++ .../storage/thirdparty/orc/OrcDataSource.java | 37 ++ .../thirdparty/orc/OrcDataSourceUtils.java | 82 ++++ .../storage/thirdparty/orc/OrcReader.java | 219 +++++++++ .../thirdparty/orc/OrcRecordReader.java | 321 +++++++++++++ .../storage/thirdparty/orc/SliceVector.java | 36 ++ .../thirdparty/orc/StreamDescriptor.java | 83 ++++ .../tajo/storage/thirdparty/orc/StreamId.java | 77 +++ .../tajo/storage/thirdparty/orc/Stripe.java | 70 +++ .../storage/thirdparty/orc/StripeReader.java | 352 ++++++++++++++ .../checkpoint/BooleanStreamCheckpoint.java | 58 +++ 
.../checkpoint/ByteArrayStreamCheckpoint.java | 50 ++ .../orc/checkpoint/ByteStreamCheckpoint.java | 60 +++ .../orc/checkpoint/Checkpoints.java | 405 ++++++++++++++++ .../checkpoint/DoubleStreamCheckpoint.java | 50 ++ .../orc/checkpoint/FloatStreamCheckpoint.java | 50 ++ .../orc/checkpoint/InputStreamCheckpoint.java | 64 +++ .../checkpoint/LongStreamDwrfCheckpoint.java | 50 ++ .../checkpoint/LongStreamV1Checkpoint.java | 60 +++ .../checkpoint/LongStreamV2Checkpoint.java | 60 +++ ...GroupDictionaryLengthStreamCheckpoint.java | 53 ++ .../orc/json/BooleanJsonReader.java | 117 +++++ .../thirdparty/orc/json/ByteJsonReader.java | 118 +++++ .../thirdparty/orc/json/DateJsonReader.java | 123 +++++ .../thirdparty/orc/json/DoubleJsonReader.java | 120 +++++ .../thirdparty/orc/json/FloatJsonReader.java | 122 +++++ .../thirdparty/orc/json/JsonMapKeyReader.java | 23 + .../thirdparty/orc/json/JsonReader.java | 36 ++ .../thirdparty/orc/json/JsonReaders.java | 100 ++++ .../thirdparty/orc/json/ListJsonReader.java | 125 +++++ .../orc/json/LongDictionaryJsonReader.java | 142 ++++++ .../orc/json/LongDirectJsonReader.java | 112 +++++ .../thirdparty/orc/json/LongJsonReader.java | 99 ++++ .../thirdparty/orc/json/MapJsonReader.java | 138 ++++++ .../orc/json/SliceDictionaryJsonReader.java | 269 +++++++++++ .../orc/json/SliceDirectJsonReader.java | 168 +++++++ .../thirdparty/orc/json/SliceJsonReader.java | 98 ++++ .../thirdparty/orc/json/StructJsonReader.java | 117 +++++ .../orc/json/TimestampJsonReader.java | 134 ++++++ .../orc/metadata/DwrfMetadataReader.java | 367 ++++++++++++++ .../orc/metadata/OrcMetadataReader.java | 402 ++++++++++++++++ .../orc/metadata/StringStatistics.java | 39 ++ .../orc/reader/BooleanStreamReader.java | 153 ++++++ .../orc/reader/ByteStreamReader.java | 155 ++++++ .../orc/reader/DoubleStreamReader.java | 155 ++++++ .../orc/reader/FloatStreamReader.java | 156 ++++++ .../orc/reader/JsonStreamReader.java | 180 +++++++ .../reader/LongDictionaryStreamReader.java | 210 
++++++++ .../orc/reader/LongDirectStreamReader.java | 155 ++++++ .../orc/reader/LongStreamReader.java | 88 ++++ .../reader/SliceDictionaryStreamReader.java | 287 +++++++++++ .../orc/reader/SliceDirectStreamReader.java | 198 ++++++++ .../orc/reader/SliceStreamReader.java | 88 ++++ .../thirdparty/orc/reader/StreamReaders.java | 58 +++ .../orc/reader/TimestampStreamReader.java | 217 +++++++++ .../orc/stream/ByteArrayStream.java | 67 +++ .../thirdparty/orc/stream/ByteStream.java | 134 ++++++ .../orc/stream/CheckpointStreamSource.java | 69 +++ .../thirdparty/orc/stream/DoubleStream.java | 104 ++++ .../thirdparty/orc/stream/FloatStream.java | 109 +++++ .../thirdparty/orc/stream/LongDecode.java | 177 +++++++ .../thirdparty/orc/stream/LongStreamDwrf.java | 129 +++++ .../thirdparty/orc/stream/LongStreamV1.java | 184 +++++++ .../thirdparty/orc/stream/LongStreamV2.java | 452 ++++++++++++++++++ .../thirdparty/orc/stream/OrcInputStream.java | 274 +++++++++++ .../thirdparty/orc/stream/OrcStreamUtils.java | 61 +++ .../thirdparty/orc/stream/StreamSources.java | 56 +++ 75 files changed, 9692 insertions(+) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java 
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 05afba362d..5b36262539 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -352,6 +352,69 @@ net.minidev json-smart + + io.airlift + slice + 0.7 + + + io.airlift + units + 0.97 + + + com.google.guava + guava + 18.0 + + + joda-time + joda-time + 2.4 + + + org.iq80.snappy + snappy + 0.2 + + + com.facebook.presto.hive + hive-apache + 0.9 + + + org.jetbrains + annotations + 13.0 + + + com.fasterxml.jackson.core + jackson-core + 2.4.2 + + + com.facebook.hive + hive-dwrf + 0.8 + + + commons-logging + commons-logging + + + org.iq80.snappy + snappy + + + com.facebook.presto.hadoop + hadoop-cdh4 + + + it.unimi.dsi + fastutil + + + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java new file mode 100644 index 0000000000..aaa1ada35c --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; + +public class BooleanVector + implements Vector +{ + public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; + public final boolean[] vector = new boolean[MAX_VECTOR_LENGTH]; + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + ObjectVector objectVector = new ObjectVector(); + for (int i = 0; i < size; i++) { + if (!isNull[i]) { + objectVector.vector[i] = vector[i]; + } + } + return objectVector; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java new file mode 100644 index 0000000000..8a3f249c3f --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.primitives.Ints; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +public final class DiskRange +{ + private final long offset; + private final int length; + + public DiskRange(long offset, int length) + { + checkArgument(offset >= 0, "offset is negative"); + checkArgument(length >= 0, "length is negative"); + + this.offset = offset; + this.length = length; + } + + public long getOffset() + { + return offset; + } + + public int getLength() + { + return length; + } + + public long getEnd() + { + return offset + length; + } + + public boolean contains(DiskRange diskRange) + { + return offset <= diskRange.getOffset() && diskRange.getEnd() <= getEnd(); + } + + /** + * Returns the minimal DiskRange that encloses both this DiskRange + * and otherDiskRange. If there was a gap between the ranges the + * new range will cover that gap. 
+ */ + public DiskRange span(DiskRange otherDiskRange) + { + checkNotNull(otherDiskRange, "otherDiskRange is null"); + long start = Math.min(this.offset, otherDiskRange.getOffset()); + long end = Math.max(getEnd(), otherDiskRange.getEnd()); + return new DiskRange(start, Ints.checkedCast(end - start)); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("offset", offset) + .add("length", length) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java new file mode 100644 index 0000000000..8f20d29590 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; + +public class DoubleVector + implements Vector +{ + public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; + public final double[] vector = new double[MAX_VECTOR_LENGTH]; + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + ObjectVector objectVector = new ObjectVector(); + for (int i = 0; i < size; i++) { + if (!isNull[i]) { + objectVector.vector[i] = vector[i]; + } + } + return objectVector; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java new file mode 100644 index 0000000000..3d0c42eb89 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java @@ -0,0 +1,125 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slice; +import io.airlift.units.DataSize; + +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Map.Entry; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; +import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; + +public class FileOrcDataSource + implements OrcDataSource +{ + private final File path; + private final long size; + private final RandomAccessFile input; + private final DataSize maxMergeDistance; + private long readTimeNanos; + + public FileOrcDataSource(File path, DataSize maxMergeDistance) + throws IOException + { + this.path = checkNotNull(path, "path is null"); + this.size = path.length(); + this.input = new RandomAccessFile(path, "r"); + + this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); + } + + @Override + public void close() + throws IOException + { + input.close(); + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public long getSize() + { + return size; + } + + @Override + public void readFully(long position, byte[] buffer) + throws IOException + { + readFully(position, buffer, 0, buffer.length); + } + + @Override + public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + throws IOException + { + long start = System.nanoTime(); + + input.seek(position); + input.readFully(buffer, bufferOffset, bufferLength); + + readTimeNanos += System.nanoTime() - start; + } + + @Override + public Map readFully(Map diskRanges) + throws IOException + { + checkNotNull(diskRanges, "diskRanges is null"); + + if (diskRanges.isEmpty()) { + return ImmutableMap.of(); + } 
+ + // TODO: benchmark alternatively strategies: + // 1) sort ranges and perform one read per range + // 2) single read with transferTo() using custom WritableByteChannel + + Iterable mergedRanges = mergeAdjacentDiskRanges(diskRanges.values(), maxMergeDistance); + + // read ranges + Map buffers = new LinkedHashMap(); + for (DiskRange mergedRange : mergedRanges) { + // read full range in one request + byte[] buffer = new byte[mergedRange.getLength()]; + readFully(mergedRange.getOffset(), buffer); + buffers.put(mergedRange, buffer); + } + + ImmutableMap.Builder slices = ImmutableMap.builder(); + for (Entry entry : diskRanges.entrySet()) { + slices.put(entry.getKey(), getDiskRangeSlice(entry.getValue(), buffers)); + } + return slices.build(); + } + + @Override + public String toString() + { + return path.getPath(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java new file mode 100644 index 0000000000..a373c27581 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java @@ -0,0 +1,125 @@ + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.tajo.storage.thirdparty.orc.DiskRange; +import org.apache.tajo.storage.thirdparty.orc.OrcDataSource; +import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slice; +import io.airlift.units.DataSize; +import org.apache.hadoop.fs.FSDataInputStream; + +import java.io.IOException; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Map.Entry; + +import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; +import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; + +public class HdfsOrcDataSource + implements OrcDataSource +{ + private final FSDataInputStream inputStream; + private final String path; + private final long size; + private final DataSize maxMergeDistance; + private long readTimeNanos; + + public HdfsOrcDataSource(String path, FSDataInputStream inputStream, long size, DataSize maxMergeDistance) + { + this.path = checkNotNull(path, "path is null"); + this.inputStream = checkNotNull(inputStream, "inputStream is null"); + this.size = size; + checkArgument(size >= 0, "size is negative"); + + this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); + } + + @Override + public void close() + throws IOException + { + inputStream.close(); + } + + @Override + public long getReadTimeNanos() + { + return readTimeNanos; + } + + @Override + public long getSize() + { + return size; + } + + @Override + public void readFully(long position, byte[] buffer) + throws IOException + { + readFully(position, buffer, 0, buffer.length); + } + + @Override + public void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + throws IOException + { + long start = System.nanoTime(); + + inputStream.readFully(position, buffer, 
bufferOffset, bufferLength); + + readTimeNanos += System.nanoTime() - start; + } + + @Override + public Map readFully(Map diskRanges) + throws IOException + { + checkNotNull(diskRanges, "diskRanges is null"); + + if (diskRanges.isEmpty()) { + return ImmutableMap.of(); + } + + Iterable mergedRanges = mergeAdjacentDiskRanges(diskRanges.values(), maxMergeDistance); + + // read ranges + Map buffers = new LinkedHashMap(); + for (DiskRange mergedRange : mergedRanges) { + // read full range in one request + byte[] buffer = new byte[mergedRange.getLength()]; + readFully(mergedRange.getOffset(), buffer); + buffers.put(mergedRange, buffer); + } + + ImmutableMap.Builder slices = ImmutableMap.builder(); + for (Entry entry : diskRanges.entrySet()) { + slices.put(entry.getKey(), getDiskRangeSlice(entry.getValue(), buffers)); + } + return slices.build(); + } + + @Override + public String toString() + { + return path; + } +} + + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java new file mode 100644 index 0000000000..7c9407a3e6 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; + +public class LongVector + implements Vector +{ + public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; + public final long[] vector = new long[MAX_VECTOR_LENGTH]; + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + ObjectVector objectVector = new ObjectVector(); + for (int i = 0; i < size; i++) { + if (!isNull[i]) { + objectVector.vector[i] = vector[i]; + } + } + return objectVector; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java new file mode 100644 index 0000000000..19f9608f7d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java @@ -0,0 +1,29 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; + +public class ObjectVector + implements Vector +{ + public final Object[] vector = new Object[MAX_VECTOR_LENGTH]; + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + return this; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java new file mode 100644 index 0000000000..c780bcb51f --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java @@ -0,0 +1,43 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.jetbrains.annotations.Contract; + +import java.io.IOException; + +import static java.lang.String.format; + +public class OrcCorruptionException + extends IOException +{ + @Contract("false, _, _ -> fail") + public static void verifyFormat(boolean test, String messageFormat, Object... args) + throws OrcCorruptionException + { + if (!test) { + throw new OrcCorruptionException(messageFormat, args); + } + } + + public OrcCorruptionException(String messageFormat, Object... 
args) + { + super(format(messageFormat, args)); + } + + public OrcCorruptionException(Throwable cause, String messageFormat, Object... args) + { + super(format(messageFormat, args), cause); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java new file mode 100644 index 0000000000..8eb1cbdd00 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java @@ -0,0 +1,37 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import io.airlift.slice.Slice; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Map; + +public interface OrcDataSource + extends Closeable +{ + long getReadTimeNanos(); + + long getSize(); + + void readFully(long position, byte[] buffer) + throws IOException; + + void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) + throws IOException; + + Map readFully(Map diskRanges) + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java new file mode 100644 index 0000000000..ba65c3c55c --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java @@ -0,0 +1,82 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.primitives.Ints; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import io.airlift.units.DataSize; + +import java.util.*; +import java.util.Map.Entry; + +import static com.google.common.collect.Lists.newArrayList; + +public final class OrcDataSourceUtils +{ + private OrcDataSourceUtils() + { + } + + /** + * Merge disk ranges that are closer than {@code maxMergeDistance}. 
+ */ + public static Iterable mergeAdjacentDiskRanges(Iterable diskRanges, DataSize maxMergeDistance) + { + // sort ranges by start offset + List ranges = newArrayList(diskRanges); + Collections.sort(ranges, new Comparator() { + @Override + public int compare(DiskRange o1, DiskRange o2) { + return Long.compare(o1.getOffset(), o2.getOffset()); + } + }); + + // merge overlapping ranges + long maxMergeDistanceBytes = maxMergeDistance.toBytes(); + List result = new ArrayList(); + DiskRange last = ranges.get(0); + for (int i = 1; i < ranges.size(); i++) { + DiskRange current = ranges.get(i); + if (last.getEnd() + maxMergeDistanceBytes + 1 >= current.getOffset()) { + last = last.span(current); + } + else { + result.add(last); + last = current; + } + } + result.add(last); + + return result; + } + + /** + * Get a slice for the disk range from the provided buffers. The buffers ranges do not have + * to exactly match {@code diskRange}, but {@code diskRange} must be completely contained within + * one of the buffer ranges. 
+ */ + public static Slice getDiskRangeSlice(DiskRange diskRange, Map buffers) + { + for (Entry bufferEntry : buffers.entrySet()) { + DiskRange bufferRange = bufferEntry.getKey(); + byte[] buffer = bufferEntry.getValue(); + if (bufferRange.contains(diskRange)) { + int offset = Ints.checkedCast(diskRange.getOffset() - bufferRange.getOffset()); + return Slices.wrappedBuffer(buffer, offset, diskRange.getLength()); + } + } + throw new IllegalStateException("No matching buffer for disk range"); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java new file mode 100644 index 0000000000..144baa5e7b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java @@ -0,0 +1,219 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.base.Joiner; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.metadata.*; +import org.apache.tajo.storage.thirdparty.orc.stream.OrcInputStream; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static io.airlift.slice.SizeOf.SIZE_OF_BYTE; + +public class OrcReader +{ + private static final Slice MAGIC = Slices.utf8Slice("ORC"); + private static final int CURRENT_MAJOR_VERSION = 0; + private static final int CURRENT_MINOR_VERSION = 12; + private static final int EXPECTED_FOOTER_SIZE = 16 * 1024; + + private final OrcDataSource orcDataSource; + private final MetadataReader metadataReader; + private final CompressionKind compressionKind; + private final int bufferSize; + private final Footer footer; + private final Metadata metadata; + + // This is based on the Apache Hive ORC code + public OrcReader(OrcDataSource orcDataSource, MetadataReader metadataReader) + throws IOException + { + this.orcDataSource = checkNotNull(orcDataSource, "orcDataSource is null"); + this.metadataReader = checkNotNull(metadataReader, "metadataReader is null"); + + // + // Read the file tail: + // + // variable: Footer + // variable: Metadata + // variable: PostScript - contains length of footer and metadata + // 3 bytes: file magic "ORC" + // 1 byte: postScriptSize = PostScript + Magic + + // figure out the size of the file using the option or filesystem + long size = orcDataSource.getSize(); + + // Read the tail of the file + byte[] buffer = new byte[(int) Math.min(size, EXPECTED_FOOTER_SIZE)]; + orcDataSource.readFully(size - buffer.length, buffer); + + // get length of 
PostScript - last byte of the file + int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff; + + // make sure this is an ORC file and not an RCFile or something else + verifyOrcFooter(orcDataSource, postScriptSize, buffer); + + // decode the post script + int postScriptOffset = buffer.length - SIZE_OF_BYTE - postScriptSize; + PostScript postScript = metadataReader.readPostScript(buffer, postScriptOffset, postScriptSize); + + // verify this is a supported version + checkOrcVersion(orcDataSource, postScript.getVersion()); + + // check compression codec is supported + this.compressionKind = postScript.getCompression(); + + this.bufferSize = Ints.checkedCast(postScript.getCompressionBlockSize()); + + int footerSize = Ints.checkedCast(postScript.getFooterLength()); + int metadataSize = Ints.checkedCast(postScript.getMetadataLength()); + + // check if extra bytes need to be read + Slice completeFooterSlice; + int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE; + if (completeFooterSize > buffer.length) { + // allocate a new buffer large enough for the complete footer + byte[] newBuffer = new byte[completeFooterSize]; + completeFooterSlice = Slices.wrappedBuffer(newBuffer); + + // initial read was not large enough, so read missing section + orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length); + + // copy already read bytes into the new buffer + completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer); + } + else { + // footer is already in the bytes in buffer, just adjust position, length + completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize); + } + + // read metadata + Slice metadataSlice = completeFooterSlice.slice(0, metadataSize); + InputStream metadataInputStream = new OrcInputStream(orcDataSource.toString(), metadataSlice.getInput(), compressionKind, bufferSize); + this.metadata = 
metadataReader.readMetadata(metadataInputStream); + + // read footer + Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize); + InputStream footerInputStream = new OrcInputStream(orcDataSource.toString(), footerSlice.getInput(), compressionKind, bufferSize); + this.footer = metadataReader.readFooter(footerInputStream); + } + + public List getColumnNames() + { + return footer.getTypes().get(0).getFieldNames(); + } + + public Footer getFooter() + { + return footer; + } + + public Metadata getMetadata() + { + return metadata; + } + + public CompressionKind getCompressionKind() + { + return compressionKind; + } + + public int getBufferSize() + { + return bufferSize; + } + + public OrcRecordReader createRecordReader( + Set includedColumns, + OrcPredicate predicate, + long offset, + long length, + DateTimeZone hiveStorageTimeZone) + throws IOException + { + return new OrcRecordReader( + checkNotNull(includedColumns, "includedColumns is null"), + checkNotNull(predicate, "predicate is null"), + footer.getNumberOfRows(), + footer.getStripes(), + footer.getFileStats(), + metadata.getStripeStatsList(), + orcDataSource, + offset, + length, + footer.getTypes(), + compressionKind, + bufferSize, + footer.getRowsInRowGroup(), + checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"), + metadataReader); + } + + /** + * Verify this is an ORC file to prevent users from trying to read text + * files or RC files as ORC files. + */ + // This is based on the Apache Hive ORC code + private static void verifyOrcFooter( + OrcDataSource source, + int postScriptSize, + byte[] buffer) + throws IOException + { + int magicLength = MAGIC.length(); + checkArgument(postScriptSize >= magicLength + 1, "Malformed ORC file %s. 
Invalid postscript length %s", source, postScriptSize); + + if (!MAGIC.equals(Slices.wrappedBuffer(buffer, buffer.length - 1 - magicLength, magicLength))) { + // Old versions of ORC (0.11) wrote the magic to the head of the file + byte[] headerMagic = new byte[magicLength]; + source.readFully(0, headerMagic); + + // if it isn't there, this isn't an ORC file + checkArgument(MAGIC.equals(Slices.wrappedBuffer(headerMagic)), "Malformed ORC file %s. Invalid postscript.", source); + } + } + + /** + * Check to see if this ORC file is from a future version and if so, + * warn the user that we may not be able to read all of the column encodings. + */ + // This is based on the Apache Hive ORC code + private static void checkOrcVersion(OrcDataSource orcDataSource, List version) + { + if (version.size() >= 1) { + int major = version.get(0); + int minor = 0; + if (version.size() > 1) { + minor = version.get(1); + } + + if (major > CURRENT_MAJOR_VERSION || (major == CURRENT_MAJOR_VERSION && minor > CURRENT_MINOR_VERSION)) { + System.err.println(String.format("ORC file %s was written by a newer Hive version %s. This file may not be readable by this version of Hive (%s.%s).", + orcDataSource, + Joiner.on('.').join(version), + CURRENT_MAJOR_VERSION, + CURRENT_MINOR_VERSION)); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java new file mode 100644 index 0000000000..9f0e78300d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java @@ -0,0 +1,321 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tajo.storage.thirdparty.orc;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.primitives.Ints;
+import org.apache.tajo.storage.thirdparty.orc.metadata.*;
+import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind;
+import org.apache.tajo.storage.thirdparty.orc.reader.StreamReader;
+import org.apache.tajo.storage.thirdparty.orc.reader.StreamReaders;
+import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources;
+import org.joda.time.DateTimeZone;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkNotNull;
+
+/**
+ * Reads rows from the stripes of an ORC file in batches, advancing
+ * stripe-by-stripe and row-group-by-row-group. Constructed by
+ * {@code OrcReader.createRecordReader}; not thread-safe.
+ */
+public class OrcRecordReader
+{
+ private final OrcDataSource orcDataSource;
+
+ // one reader per top-level column; null for columns not included
+ private final StreamReader[] streamReaders;
+
+ private final long totalRowCount;
+ private final long splitLength;
+ private final Set presentColumns;
+ private long currentPosition;
+
+ private final List stripes;
+ private final StripeReader stripeReader;
+ // -1 means "before the first stripe"; advanced lazily by advanceToNextStripe()
+ private int currentStripe = -1;
+
+ private Iterator rowGroups = ImmutableList.of().iterator();
+ private long currentGroupRowCount;
+ private long nextRowInGroup;
+
+ public OrcRecordReader(
+ Set includedColumns,
+ OrcPredicate predicate,
+ long numberOfRows,
+ List fileStripes,
+ List fileStats,
+ List stripeStats,
+ OrcDataSource orcDataSource,
+ long splitOffset,
+ long splitLength,
+ List types,
+ CompressionKind compressionKind,
+ int bufferSize,
+ int rowsInRowGroup,
+ DateTimeZone hiveStorageTimeZone,
+ MetadataReader metadataReader)
+ throws IOException
+ {
+ // NOTE(review): fileStats and metadataReader are not null-checked here,
+ // unlike the other reference parameters — confirm whether that is intentional
+ checkNotNull(includedColumns, "includedColumns is null");
+ checkNotNull(predicate, "predicate is null");
+ checkNotNull(fileStripes, "fileStripes is null");
+ checkNotNull(stripeStats, "stripeStats is null");
+ checkNotNull(orcDataSource, "orcDataSource is null");
+ checkNotNull(types, "types is null");
+ checkNotNull(compressionKind, "compressionKind is null");
+ checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null");
+
+ // reduce the included columns to the set that is also present
+ ImmutableSet.Builder presentColumns = ImmutableSet.builder();
+ OrcType root = types.get(0);
+ for (int includedColumn : includedColumns) {
+ // an old file can have less columns since columns can be added
+ // after the file was written
+ if (includedColumn < root.getFieldCount()) {
+ presentColumns.add(includedColumn);
+ }
+ }
+ this.presentColumns = presentColumns.build();
+
+ this.orcDataSource = orcDataSource;
+ this.splitLength = splitLength;
+
+ // it is possible that old versions of orc use 0 to mean there are no row groups
+ checkArgument(rowsInRowGroup > 0, "rowsInRowGroup must be greater than zero");
+
+ long totalRowCount = 0;
+ ImmutableList.Builder stripes = ImmutableList.builder();
+ if (predicate.matches(numberOfRows, getStatisticsByColumnOrdinal(root, fileStats))) {
+ // select stripes that start within the specified split
+ for (int stripeIndex = 0; stripeIndex < fileStripes.size(); stripeIndex++) {
+ StripeInformation stripe = fileStripes.get(stripeIndex);
+ if (splitContainsStripe(splitOffset, splitLength, stripe) && isStripeIncluded(root, stripe, stripeStats, predicate, stripeIndex)) {
+ stripes.add(stripe);
+ totalRowCount += stripe.getNumberOfRows();
+ }
+ }
+ }
+ this.totalRowCount = totalRowCount;
+ this.stripes = stripes.build();
+
+ stripeReader = new StripeReader(
+ orcDataSource,
+ compressionKind,
+ types,
+ bufferSize,
+ this.presentColumns,
+ rowsInRowGroup,
+ predicate,
+ metadataReader);
+
+ streamReaders = createStreamReaders(orcDataSource, types, hiveStorageTimeZone, this.presentColumns);
+ }
+
+ // a stripe belongs to the split that contains its start offset (half-open interval)
+ private static boolean splitContainsStripe(long splitOffset, long splitLength, StripeInformation stripe)
+ {
+ long splitEndOffset = splitOffset + splitLength;
+ return splitOffset <= stripe.getOffset() && stripe.getOffset() < splitEndOffset;
+ }
+
+ private static boolean isStripeIncluded(
+ OrcType rootStructType,
+ StripeInformation stripe,
+ List stripeStats,
+ OrcPredicate predicate,
+ int stripeIndex)
+ {
+ // if there are no stats, include the column
+ if (stripeIndex >= stripeStats.size()) {
+ return true;
+ }
+
+ return predicate.matches(stripe.getNumberOfRows(), getStatisticsByColumnOrdinal(rootStructType, stripeStats.get(stripeIndex).getColumnStatistics()));
+ }
+
+ /** Number of rows consumed so far (across all selected stripes). */
+ public long getPosition()
+ {
+ return currentPosition;
+ }
+
+ /** Total rows in the stripes selected for this split. */
+ public long getTotalRowCount()
+ {
+ return totalRowCount;
+ }
+
+ public float getProgress()
+ {
+ return ((float) currentPosition) / totalRowCount;
+ }
+
+ public long getSplitLength()
+ {
+ return splitLength;
+ }
+
+ public void close()
+ throws IOException
+ {
+ orcDataSource.close();
+ }
+
+ public boolean isColumnPresent(int hiveColumnIndex)
+ {
+ return presentColumns.contains(hiveColumnIndex);
+ }
+
+ /**
+ * Prepare the next batch of rows on every stream reader.
+ *
+ * @return number of rows in the batch, or -1 when no rows remain
+ */
+ public int nextBatch()
+ throws IOException
+ {
+ // if next row is within the current group return
+ if (nextRowInGroup >= currentGroupRowCount) {
+ // attempt to advance to next row group
+ if (!advanceToNextRowGroup()) {
+ return -1;
+ }
+ }
+
+ int batchSize = Ints.checkedCast(Math.min(Vector.MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup));
+
+ for (StreamReader column : streamReaders) {
+ if (column != null) {
+ column.prepareNextRead(batchSize);
+ }
+ }
+ nextRowInGroup += batchSize;
+ currentPosition += batchSize;
+ return batchSize;
+ }
+
+ // fills the given vector with the current batch for one column;
+ // must be called after nextBatch()
+ public void readVector(int columnIndex, Object vector)
+ throws IOException
+ {
+ streamReaders[columnIndex].readBatch(vector);
+ }
+
+ private boolean advanceToNextRowGroup()
+ throws IOException
+ {
+ nextRowInGroup = 0;
+
+ // skip stripes whose row groups were all pruned by the predicate
+ while (!rowGroups.hasNext() && currentStripe < stripes.size()) {
+ advanceToNextStripe();
+ }
+
+ if (!rowGroups.hasNext()) {
+ currentGroupRowCount = 0;
+ return false;
+ }
+
+ RowGroup currentRowGroup = rowGroups.next();
+ currentGroupRowCount = currentRowGroup.getRowCount();
+
+ // give reader data streams from row group
+ StreamSources rowGroupStreamSources = currentRowGroup.getStreamSources();
+ for (StreamReader column : streamReaders) {
+ if (column != null) {
+ column.startRowGroup(rowGroupStreamSources);
+ }
+ }
+
+ return true;
+ }
+
+ private void advanceToNextStripe()
+ throws IOException
+ {
+ currentStripe++;
+ if (currentStripe >= stripes.size()) {
+ return;
+ }
+
+ StripeInformation stripeInformation = stripes.get(currentStripe);
+ // readStripe returns null when every row group in the stripe was pruned
+ Stripe stripe = stripeReader.readStripe(stripeInformation);
+ if (stripe != null) {
+ // Give readers access to dictionary streams
+ StreamSources dictionaryStreamSources = stripe.getDictionaryStreamSources();
+ List columnEncodings = stripe.getColumnEncodings();
+ for (StreamReader column : streamReaders) {
+ if (column != null) {
+ column.startStripe(dictionaryStreamSources, columnEncodings);
+ }
+ }
+
+ rowGroups = stripe.getRowGroups().iterator();
+ }
+ else {
+ rowGroups = ImmutableList.of().iterator();
+ }
+ }
+
+ // builds one StreamReader per top-level column; slots for excluded columns stay null
+ private static StreamReader[] createStreamReaders(OrcDataSource orcDataSource,
+ List types,
+ DateTimeZone hiveStorageTimeZone,
+ Set includedColumns)
+ {
+ List streamDescriptors = createStreamDescriptor("", "", 0, types, orcDataSource).getNestedStreams();
+
+ OrcType rowType = types.get(0);
+ StreamReader[] streamReaders = new StreamReader[rowType.getFieldCount()];
+ for (int columnId = 0; columnId < rowType.getFieldCount(); columnId++) {
+ if (includedColumns.contains(columnId)) {
+ StreamDescriptor streamDescriptor = streamDescriptors.get(columnId);
+ streamReaders[columnId] = StreamReaders.createStreamReader(streamDescriptor, hiveStorageTimeZone);
+ }
+ }
+ return streamReaders;
+ }
+
+ // recursively mirrors the ORC type tree into a StreamDescriptor tree,
+ // naming nested streams "parent.field" / "item" / "key" / "value"
+ private static StreamDescriptor createStreamDescriptor(String parentStreamName, String fieldName, int typeId, List types, OrcDataSource dataSource)
+ {
+ OrcType type = types.get(typeId);
+
+ if (!fieldName.isEmpty()) {
+ parentStreamName += "." + fieldName;
+ }
+
+ ImmutableList.Builder nestedStreams = ImmutableList.builder();
+ if (type.getOrcTypeKind() == OrcTypeKind.STRUCT) {
+ for (int i = 0; i < type.getFieldCount(); ++i) {
+ nestedStreams.add(createStreamDescriptor(parentStreamName, type.getFieldName(i), type.getFieldTypeIndex(i), types, dataSource));
+ }
+ }
+ else if (type.getOrcTypeKind() == OrcTypeKind.LIST) {
+ nestedStreams.add(createStreamDescriptor(parentStreamName, "item", type.getFieldTypeIndex(0), types, dataSource));
+ }
+ else if (type.getOrcTypeKind() == OrcTypeKind.MAP) {
+ nestedStreams.add(createStreamDescriptor(parentStreamName, "key", type.getFieldTypeIndex(0), types, dataSource));
+ nestedStreams.add(createStreamDescriptor(parentStreamName, "value", type.getFieldTypeIndex(1), types, dataSource));
+ }
+ return new StreamDescriptor(parentStreamName, typeId, fieldName, type.getOrcTypeKind(), dataSource, nestedStreams.build());
+ }
+
+ // maps column ordinal -> statistics, keyed by position in the root struct;
+ // columns with no statistics are simply absent from the map
+ private static Map getStatisticsByColumnOrdinal(OrcType rootStructType, List fileStats)
+ {
+ checkNotNull(rootStructType, "rootStructType is null");
+ checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT);
+ checkNotNull(fileStats, "fileStats is null");
+
+ ImmutableMap.Builder statistics = ImmutableMap.builder();
+ for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) {
+ ColumnStatistics element = fileStats.get(rootStructType.getFieldTypeIndex(ordinal));
+ if (element != null) {
+ statistics.put(ordinal, element);
+ }
+ }
return statistics.build(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java new file mode 100644 index 0000000000..01cfbfca80 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; +import io.airlift.slice.Slice; + +public class SliceVector + implements Vector +{ + public final Slice[] vector = new Slice[MAX_VECTOR_LENGTH]; + + @Override + @VisibleForTesting + public ObjectVector toObjectVector(int size) + { + ObjectVector objectVector = new ObjectVector(); + for (int i = 0; i < size; i++) { + if (vector[i] != null) { + objectVector.vector[i] = vector[i].toStringUtf8(); + } + } + return objectVector; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java new file mode 100644 index 0000000000..a8108e6f36 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java @@ -0,0 +1,83 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.ImmutableList; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public final class StreamDescriptor +{ + private final String streamName; + private final int streamId; + private final OrcTypeKind streamType; + private final String fieldName; + private final OrcDataSource fileInput; + private final List nestedStreams; + + public StreamDescriptor(String streamName, int streamId, String fieldName, OrcTypeKind streamType, OrcDataSource fileInput, List nestedStreams) + { + this.streamName = checkNotNull(streamName, "streamName is null"); + this.streamId = streamId; + this.fieldName = checkNotNull(fieldName, "fieldName is null"); + this.streamType = checkNotNull(streamType, "type is null"); + this.fileInput = checkNotNull(fileInput, "fileInput is null"); + this.nestedStreams = ImmutableList.copyOf(checkNotNull(nestedStreams, "nestedStreams is null")); + } + + public String getStreamName() + { + return streamName; + } + + public int getStreamId() + { + return streamId; + } + + public OrcTypeKind getStreamType() + { + return streamType; + } + + public String getFieldName() + { + return fieldName; + } + + public OrcDataSource getFileInput() + { + return fileInput; + } + + public List getNestedStreams() + { + return nestedStreams; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("streamName", streamName) + .add("streamId", streamId) + .add("streamType", streamType) + .add("path", fileInput) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java new file mode 100644 index 
0000000000..3cec23c247 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; + +import java.util.Objects; + +import static com.google.common.base.MoreObjects.toStringHelper; + +public final class StreamId +{ + private final int column; + private final StreamKind streamKind; + + public StreamId(Stream stream) + { + this.column = stream.getColumn(); + this.streamKind = stream.getStreamKind(); + } + + public StreamId(int column, StreamKind streamKind) + { + this.column = column; + this.streamKind = streamKind; + } + + public int getColumn() + { + return column; + } + + public StreamKind getStreamKind() + { + return streamKind; + } + + @Override + public int hashCode() + { + return Objects.hash(column, streamKind); + } + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null || getClass() != obj.getClass()) { + return false; + } + StreamId other = (StreamId) obj; + return Objects.equals(this.column, other.column) && Objects.equals(this.streamKind, other.streamKind); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("column", column) + .add("streamKind", 
streamKind) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java new file mode 100644 index 0000000000..a95353160e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java @@ -0,0 +1,70 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.ImmutableList; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; + +public class Stripe +{ + private final long rowCount; + private final List columnEncodings; + private final List rowGroups; + private final StreamSources dictionaryStreamSources; + + public Stripe(long rowCount, List columnEncodings, List rowGroups, StreamSources dictionaryStreamSources) + { + this.rowCount = rowCount; + this.columnEncodings = checkNotNull(columnEncodings, "columnEncodings is null"); + this.rowGroups = ImmutableList.copyOf(checkNotNull(rowGroups, "rowGroups is null")); + this.dictionaryStreamSources = checkNotNull(dictionaryStreamSources, "dictionaryStreamSources is null"); + 
} + + public long getRowCount() + { + return rowCount; + } + + public List getColumnEncodings() + { + return columnEncodings; + } + + public List getRowGroups() + { + return rowGroups; + } + + public StreamSources getDictionaryStreamSources() + { + return dictionaryStreamSources; + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("rowCount", rowCount) + .add("columnEncodings", columnEncodings) + .add("rowGroups", rowGroups) + .add("dictionaryStreams", dictionaryStreamSources) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java new file mode 100644 index 0000000000..1e4c4bc273 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java @@ -0,0 +1,352 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.base.Function; +import com.google.common.base.Predicates; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Maps; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.metadata.*; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.stream.*; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; + +import java.io.IOException; +import java.io.InputStream; +import java.util.*; +import java.util.Map.Entry; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getStreamCheckpoints; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.CheckpointStreamSource.createCheckpointStreamSource; + +public class StripeReader +{ + private final OrcDataSource orcDataSource; + private final CompressionKind compressionKind; + private final List types; + private final int bufferSize; + private final Set includedOrcColumns; + private final int rowsInRowGroup; + private final OrcPredicate predicate; + private final MetadataReader 
metadataReader; + + public StripeReader(OrcDataSource orcDataSource, + CompressionKind compressionKind, + List types, + int bufferSize, + Set includedColumns, + int rowsInRowGroup, + OrcPredicate predicate, + MetadataReader metadataReader) + { + this.orcDataSource = checkNotNull(orcDataSource, "orcDataSource is null"); + this.compressionKind = checkNotNull(compressionKind, "compressionKind is null"); + this.types = ImmutableList.copyOf(checkNotNull(types, "types is null")); + this.bufferSize = bufferSize; + this.includedOrcColumns = getIncludedOrcColumns(types, checkNotNull(includedColumns, "includedColumns is null")); + this.rowsInRowGroup = rowsInRowGroup; + this.predicate = checkNotNull(predicate, "predicate is null"); + this.metadataReader = checkNotNull(metadataReader, "metadataReader is null"); + } + + public Stripe readStripe(StripeInformation stripe) + throws IOException + { + // read the stripe footer + StripeFooter stripeFooter = readStripeFooter(stripe); + List columnEncodings = stripeFooter.getColumnEncodings(); + + // get streams for selected columns + Map streams = new HashMap(); + for (Stream stream : stripeFooter.getStreams()) { + if (includedOrcColumns.contains(stream.getColumn())) { + streams.put(new StreamId(stream), stream); + } + } + + // determine ranges of the stripe to read + Map diskRanges = getDiskRanges(stripeFooter.getStreams()); + diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet())); + + // read the file regions + Map streamsData = readDiskRanges(stripe.getOffset(), diskRanges); + + // read the row index for each column + Map> columnIndexes = readColumnIndexes(streams, streamsData); + + // select the row groups matching the tuple domain + Set selectedRowGroups = selectRowGroups(stripe, columnIndexes); + + // if all row groups are skipped, return null + if (selectedRowGroups.isEmpty()) { + return null; + } + + // value streams + Map> valueStreams = createValueStreams(streams, streamsData, columnEncodings); + + // 
build the dictionary streams + StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings); + + // build the row groups + List rowGroups = createRowGroups( + stripe.getNumberOfRows(), + streams, + valueStreams, + columnIndexes, + selectedRowGroups, + columnEncodings); + + return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources); + } + + public Map readDiskRanges(final long stripeOffset, Map diskRanges) + throws IOException + { + // transform ranges to have an absolute offset in file + diskRanges = Maps.transformValues(diskRanges, new Function() { + @Override + public DiskRange apply(DiskRange diskRange) + { + return new DiskRange(stripeOffset + diskRange.getOffset(), diskRange.getLength()); + } + }); + + Map streamsData = orcDataSource.readFully(diskRanges); + + return ImmutableMap.copyOf(Maps.transformValues(streamsData, new Function() + { + @Override + public OrcInputStream apply(Slice input) + { + return new OrcInputStream(orcDataSource.toString(), input.getInput(), compressionKind, bufferSize); + } + })); + } + + private Map> createValueStreams(Map streams, Map streamsData, List columnEncodings) + { + ImmutableMap.Builder> valueStreams = ImmutableMap.builder(); + for (Entry entry : streams.entrySet()) { + StreamId streamId = entry.getKey(); + Stream stream = entry.getValue(); + ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind(); + + // skip index and empty streams + if (isIndexStream(stream) || stream.getLength() == 0) { + continue; + } + + OrcInputStream inputStream = streamsData.get(streamId); + OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); + + valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); + } + return valueStreams.build(); + } + + public StreamSources createDictionaryStreamSources(Map streams, Map> 
valueStreams, List columnEncodings) + { + ImmutableMap.Builder> dictionaryStreamBuilder = ImmutableMap.builder(); + for (Entry entry : streams.entrySet()) { + StreamId streamId = entry.getKey(); + Stream stream = entry.getValue(); + int column = stream.getColumn(); + + // only process dictionary streams + ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind(); + if (!isDictionary(stream, columnEncoding)) { + continue; + } + + // skip streams without data + ValueStream valueStream = valueStreams.get(streamId); + if (valueStream == null) { + continue; + } + + OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); + StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding); + + StreamSource streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint); + dictionaryStreamBuilder.put(streamId, streamSource); + } + return new StreamSources(dictionaryStreamBuilder.build()); + } + + private List createRowGroups( + int rowsInStripe, + Map streams, + Map> valueStreams, + Map> columnIndexes, + Set selectedRowGroups, + List encodings) + { + ImmutableList.Builder rowGroupBuilder = ImmutableList.builder(); + + for (int rowGroupId : selectedRowGroups) { + Map checkpoints = getStreamCheckpoints(includedOrcColumns, types, compressionKind, rowGroupId, encodings, streams, columnIndexes); + int rowsInGroup = Math.min(rowsInStripe - (rowGroupId * rowsInRowGroup), rowsInRowGroup); + rowGroupBuilder.add(createRowGroup(rowGroupId, rowsInGroup, valueStreams, checkpoints)); + } + + return rowGroupBuilder.build(); + } + + public static RowGroup createRowGroup(int groupId, int rowCount, Map> valueStreams, Map checkpoints) + { + ImmutableMap.Builder> builder = ImmutableMap.builder(); + for (Entry entry : checkpoints.entrySet()) { + StreamId streamId = entry.getKey(); + StreamCheckpoint checkpoint = entry.getValue(); + + // skip streams without data + ValueStream valueStream = 
valueStreams.get(streamId); + if (valueStream == null) { + continue; + } + + builder.put(streamId, createCheckpointStreamSource(valueStream, checkpoint)); + } + StreamSources rowGroupStreams = new StreamSources(builder.build()); + return new RowGroup(groupId, rowCount, rowGroupStreams); + } + + public StripeFooter readStripeFooter(StripeInformation stripe) + throws IOException + { + long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(); + int tailLength = Ints.checkedCast(stripe.getFooterLength()); + + // read the footer + byte[] tailBuffer = new byte[tailLength]; + orcDataSource.readFully(offset, tailBuffer); + InputStream inputStream = new OrcInputStream(orcDataSource.toString(), Slices.wrappedBuffer(tailBuffer).getInput(), compressionKind, bufferSize); + return metadataReader.readStripeFooter(types, inputStream); + } + + private Map> readColumnIndexes(Map streams, Map streamsData) + throws IOException + { + ImmutableMap.Builder> columnIndexes = ImmutableMap.builder(); + for (Entry entry : streams.entrySet()) { + Stream stream = entry.getValue(); + if (stream.getStreamKind() == ROW_INDEX) { + OrcInputStream inputStream = streamsData.get(entry.getKey()); + columnIndexes.put(stream.getColumn(), metadataReader.readRowIndexes(inputStream)); + } + } + return columnIndexes.build(); + } + + private Set selectRowGroups(StripeInformation stripe, Map> columnIndexes) + throws IOException + { + int rowsInStripe = Ints.checkedCast(stripe.getNumberOfRows()); + int groupsInStripe = ceil(rowsInStripe, rowsInRowGroup); + + ImmutableSet.Builder selectedRowGroups = ImmutableSet.builder(); + int remainingRows = rowsInStripe; + for (int rowGroup = 0; rowGroup < groupsInStripe; ++rowGroup) { + int rows = Math.min(remainingRows, rowsInRowGroup); + Map statistics = getRowGroupStatistics(types.get(0), columnIndexes, rowGroup); + if (predicate.matches(rows, statistics)) { + selectedRowGroups.add(rowGroup); + } + remainingRows -= rows; + } + return 
selectedRowGroups.build(); + } + + private static Map getRowGroupStatistics(OrcType rootStructType, Map> columnIndexes, int rowGroup) + { + checkNotNull(rootStructType, "rootStructType is null"); + checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT); + checkNotNull(columnIndexes, "columnIndexes is null"); + checkArgument(rowGroup >= 0, "rowGroup is negative"); + + ImmutableMap.Builder statistics = ImmutableMap.builder(); + for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) { + List rowGroupIndexes = columnIndexes.get(rootStructType.getFieldTypeIndex(ordinal)); + if (rowGroupIndexes != null) { + statistics.put(ordinal, rowGroupIndexes.get(rowGroup).getColumnStatistics()); + } + } + return statistics.build(); + } + + private static boolean isIndexStream(Stream stream) + { + return stream.getStreamKind() == ROW_INDEX || stream.getStreamKind() == DICTIONARY_COUNT; + } + + private static boolean isDictionary(Stream stream, ColumnEncodingKind columnEncoding) + { + return stream.getStreamKind() == DICTIONARY_DATA || (stream.getStreamKind() == LENGTH && (columnEncoding == DICTIONARY || columnEncoding == DICTIONARY_V2)); + } + + private static Map getDiskRanges(List streams) + { + ImmutableMap.Builder streamDiskRanges = ImmutableMap.builder(); + long stripeOffset = 0; + for (Stream stream : streams) { + int streamLength = Ints.checkedCast(stream.getLength()); + streamDiskRanges.put(new StreamId(stream), new DiskRange(stripeOffset, streamLength)); + stripeOffset += streamLength; + } + return streamDiskRanges.build(); + } + + private static Set getIncludedOrcColumns(List types, Set includedColumns) + { + Set includes = new LinkedHashSet(); + + OrcType root = types.get(0); + for (int includedColumn : includedColumns) { + includeOrcColumnsRecursive(types, includes, root.getFieldTypeIndex(includedColumn)); + } + + return includes; + } + + private static void includeOrcColumnsRecursive(List types, Set result, int typeId) + { + 
result.add(typeId); + OrcType type = types.get(typeId); + int children = type.getFieldCount(); + for (int i = 0; i < children; ++i) { + includeOrcColumnsRecursive(types, result, type.getFieldTypeIndex(i)); + } + } + + /** + * Ceiling of integer division + */ + private static int ceil(int dividend, int divisor) + { + return ((dividend + divisor) - 1) / divisor; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java new file mode 100644 index 0000000000..4fd403e643 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java @@ -0,0 +1,58 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static com.google.common.base.Preconditions.checkNotNull; + +public final class BooleanStreamCheckpoint + implements StreamCheckpoint +{ + private final int offset; + private final ByteStreamCheckpoint byteStreamCheckpoint; + + public BooleanStreamCheckpoint(int offset, ByteStreamCheckpoint byteStreamCheckpoint) + { + this.offset = offset; + this.byteStreamCheckpoint = checkNotNull(byteStreamCheckpoint, "byteStreamCheckpoint is null"); + } + + public BooleanStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + byteStreamCheckpoint = new ByteStreamCheckpoint(compressionKind, positionsList); + offset = positionsList.nextPosition(); + } + + public int getOffset() + { + return offset; + } + + public ByteStreamCheckpoint getByteStreamCheckpoint() + { + return byteStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("offset", offset) + .add("byteStreamCheckpoint", byteStreamCheckpoint) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java new file mode 100644 index 0000000000..a76d5c286e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class ByteArrayStreamCheckpoint + implements StreamCheckpoint +{ + private final long inputStreamCheckpoint; + + public ByteArrayStreamCheckpoint(long inputStreamCheckpoint) + { + this.inputStreamCheckpoint = inputStreamCheckpoint; + } + + public ByteArrayStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java new file mode 100644 index 0000000000..c7a93ea169 --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class ByteStreamCheckpoint + implements StreamCheckpoint +{ + private final int offset; + private final long inputStreamCheckpoint; + + public ByteStreamCheckpoint(int offset, long inputStreamCheckpoint) + { + this.offset = offset; + this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); + } + + public ByteStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + offset = positionsList.nextPosition(); + } + + public int getOffset() + { + return offset; + } + + public long getInputStreamCheckpoint() + { + return 
inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("offset", offset) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java new file mode 100644 index 0000000000..f346235d94 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java @@ -0,0 +1,405 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.collect.*; +import org.apache.tajo.storage.thirdparty.orc.StreamId; +import org.apache.tajo.storage.thirdparty.orc.metadata.*; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; + +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.base.Preconditions.checkState; +import static com.google.common.base.Predicates.equalTo; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; + +public final class Checkpoints +{ + private Checkpoints() + { + } + + public static Map getStreamCheckpoints( + Set columns, + List columnTypes, + CompressionKind compressionKind, + int rowGroupId, + List columnEncodings, + Map streams, + Map> columnIndexes) + { + ImmutableSetMultimap.Builder streamKindsBuilder = ImmutableSetMultimap.builder(); + for (Stream stream : streams.values()) { + streamKindsBuilder.put(stream.getColumn(), stream.getStreamKind()); + } + SetMultimap streamKinds = streamKindsBuilder.build(); + + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + for (int column : columns) { + List positionsList = columnIndexes.get(column).get(rowGroupId).getPositions(); + + ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind(); + OrcTypeKind columnType = columnTypes.get(column).getOrcTypeKind(); + Set availableStreams = streamKinds.get(column); + + ColumnPositionsList columnPositionsList = new 
ColumnPositionsList(column, columnType, positionsList); + switch (columnType) { + case BOOLEAN: + checkpoints.putAll(getBooleanColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case BYTE: + checkpoints.putAll(getByteColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case SHORT: + case INT: + case LONG: + case DATE: + checkpoints.putAll(getLongColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); + break; + case FLOAT: + checkpoints.putAll(getFloatColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case DOUBLE: + checkpoints.putAll(getDoubleColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case TIMESTAMP: + checkpoints.putAll(getTimestampColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); + break; + case BINARY: + case STRING: + checkpoints.putAll(getSliceColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); + break; + case LIST: + case MAP: + checkpoints.putAll(getListOrMapColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); + break; + case STRUCT: + checkpoints.putAll(getStructColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); + break; + case DECIMAL: + case CHAR: + case VARCHAR: + case UNION: + throw new IllegalArgumentException("Unsupported column type " + columnType); + } + + // The DWRF code is not meticulous in the handling of checkpoints. It appears that for the first row group + // it will write checkpoints for all streams, but in other cases it will write only the streams that exist. 
+ // We detect this case by checking that all offsets in the initial position list are zero, and if so, we + // clear the extra offsets + checkState(!columnPositionsList.hasNextPosition() || Iterables.all(positionsList, equalTo(0)), + "Column %s, of type %s, contains %s offset positions, but only %s positions were consumed", + column, + columnType, + positionsList.size(), + columnPositionsList.getIndex()); + } + return checkpoints.build(); + } + + public static StreamCheckpoint getDictionaryStreamCheckpoint(StreamId streamId, OrcTypeKind columnType, ColumnEncodingKind columnEncoding) + { + if (streamId.getStreamKind() == DICTIONARY_DATA) { + switch (columnType) { + case SHORT: + case INT: + case LONG: + return new LongStreamDwrfCheckpoint(createInputStreamCheckpoint(0, 0)); + case STRING: + case VARCHAR: + case CHAR: + case BINARY: + return new ByteArrayStreamCheckpoint(createInputStreamCheckpoint(0, 0)); + } + } + + // dictionary length and data streams are unsigned long streams + if (streamId.getStreamKind() == LENGTH || streamId.getStreamKind() == DATA) { + if (columnEncoding == DICTIONARY_V2) { + return new LongStreamV2Checkpoint(0, createInputStreamCheckpoint(0, 0)); + } + else if (columnEncoding == DICTIONARY) { + return new LongStreamV1Checkpoint(0, createInputStreamCheckpoint(0, 0)); + } + } + throw new IllegalArgumentException("Unsupported column type " + columnType + " for dictionary stream " + streamId); + } + + private static Map getBooleanColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + return 
checkpoints.build(); + } + + private static Map getByteColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new ByteStreamCheckpoint(compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getLongColumnCheckpoints( + int column, + ColumnEncodingKind encoding, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(IN_DICTIONARY)) { + checkpoints.put(new StreamId(column, IN_DICTIONARY), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getFloatColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new FloatStreamCheckpoint(compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map 
getDoubleColumnCheckpoints( + int column, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new DoubleStreamCheckpoint(compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getTimestampColumnCheckpoints( + int column, + ColumnEncodingKind encoding, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + if (availableStreams.contains(SECONDARY)) { + checkpoints.put(new StreamId(column, SECONDARY), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getSliceColumnCheckpoints( + int column, + ColumnEncodingKind encoding, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (encoding == DIRECT || encoding == DIRECT_V2) { + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), new ByteArrayStreamCheckpoint(compressionKind, positionsList)); + } + + if 
(availableStreams.contains(LENGTH)) { + checkpoints.put(new StreamId(column, LENGTH), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + } + else if (encoding == DICTIONARY || encoding == DICTIONARY_V2) { + // DWRF has rules inconsistent with the ORC style + if (availableStreams.contains(IN_DICTIONARY)) { + if (availableStreams.contains(ROW_GROUP_DICTIONARY)) { + checkpoints.put(new StreamId(column, ROW_GROUP_DICTIONARY), new ByteArrayStreamCheckpoint(compressionKind, positionsList)); + } + + checkpoints.put(new StreamId(column, ROW_GROUP_DICTIONARY_LENGTH), new RowGroupDictionaryLengthStreamCheckpoint(compressionKind, positionsList)); + + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + checkpoints.put(new StreamId(column, IN_DICTIONARY), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + else { + if (availableStreams.contains(DATA)) { + checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + } + } + else { + throw new IllegalArgumentException("Unsupported encoding for slice column: " + encoding); + } + + return checkpoints.build(); + } + + private static Map getListOrMapColumnCheckpoints( + int column, + ColumnEncodingKind encoding, + CompressionKind compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + if (availableStreams.contains(LENGTH)) { + checkpoints.put(new StreamId(column, LENGTH), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static Map getStructColumnCheckpoints( + int column, + CompressionKind 
compressionKind, + Set availableStreams, + ColumnPositionsList positionsList) + { + ImmutableMap.Builder checkpoints = ImmutableMap.builder(); + + if (availableStreams.contains(PRESENT)) { + checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); + } + + return checkpoints.build(); + } + + private static StreamCheckpoint createLongStreamCheckpoint(ColumnEncodingKind encoding, CompressionKind compressionKind, ColumnPositionsList positionsList) + { + if (encoding == DIRECT_V2 || encoding == DICTIONARY_V2) { + return new LongStreamV2Checkpoint(compressionKind, positionsList); + } + + if (encoding == DIRECT || encoding == DICTIONARY) { + return new LongStreamV1Checkpoint(compressionKind, positionsList); + } + + if (encoding == DWRF_DIRECT) { + return new LongStreamDwrfCheckpoint(compressionKind, positionsList); + } + + throw new IllegalArgumentException("Unsupported encoding for long stream: " + encoding); + } + + public static class ColumnPositionsList + { + private final int column; + private final OrcTypeKind columnType; + private final List positionsList; + private int index; + + private ColumnPositionsList(int column, OrcTypeKind columnType, List positionsList) + { + this.column = column; + this.columnType = checkNotNull(columnType, "columnType is null"); + this.positionsList = ImmutableList.copyOf(checkNotNull(positionsList, "positionsList is null")); + } + + public int getIndex() + { + return index; + } + + public boolean hasNextPosition() + { + return index < positionsList.size(); + } + + public int nextPosition() + { + checkState(hasNextPosition(), "Not enough positions for column %s, of type %s, checkpoints", + column, + columnType); + + return positionsList.get(index++); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java new file mode 100644 index 0000000000..80f03de1d9 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class DoubleStreamCheckpoint + implements StreamCheckpoint +{ + private final long inputStreamCheckpoint; + + public DoubleStreamCheckpoint(long inputStreamCheckpoint) + { + this.inputStreamCheckpoint = inputStreamCheckpoint; + } + + public DoubleStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return 
MoreObjects.toStringHelper(this) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java new file mode 100644 index 0000000000..2d92cd3494 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class FloatStreamCheckpoint + implements StreamCheckpoint +{ + private final long inputStreamCheckpoint; + + public FloatStreamCheckpoint(long inputStreamCheckpoint) + { + this.inputStreamCheckpoint = inputStreamCheckpoint; + } + + public FloatStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java new file mode 100644 index 0000000000..92550a6b91 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java @@ -0,0 +1,64 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.UNCOMPRESSED; + +/** + * InputStreamCheckpoint is represented as a packed long to avoid object creation in inner loops. + */ +public final class InputStreamCheckpoint +{ + private InputStreamCheckpoint() + { + } + + public static long createInputStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + if (compressionKind == UNCOMPRESSED) { + return createInputStreamCheckpoint(0, positionsList.nextPosition()); + } + else { + return createInputStreamCheckpoint(positionsList.nextPosition(), positionsList.nextPosition()); + } + } + + public static long createInputStreamCheckpoint(int compressedBlockOffset, int decompressedOffset) + { + return (((long) compressedBlockOffset) << 32) | decompressedOffset; + } + + public static int decodeCompressedBlockOffset(long inputStreamCheckpoint) + { + return ((int) (inputStreamCheckpoint >> 32)); + } + + public static int decodeDecompressedOffset(long inputStreamCheckpoint) + { + // low order bits contain the decompressed offset, so a simple cast here will suffice + return (int) inputStreamCheckpoint; + } + + public static String inputStreamCheckpointToString(long inputStreamCheckpoint) + { + return 
MoreObjects.toStringHelper(InputStreamCheckpoint.class) + .add("decompressedOffset", decodeDecompressedOffset(inputStreamCheckpoint)) + .add("compressedBlockOffset", decodeCompressedBlockOffset(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java new file mode 100644 index 0000000000..bb08edd940 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java @@ -0,0 +1,50 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class LongStreamDwrfCheckpoint + implements LongStreamCheckpoint +{ + private final long inputStreamCheckpoint; + + public LongStreamDwrfCheckpoint(long inputStreamCheckpoint) + { + this.inputStreamCheckpoint = inputStreamCheckpoint; + } + + public LongStreamDwrfCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java new file mode 100644 index 0000000000..410f181d38 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public class LongStreamV1Checkpoint + implements LongStreamCheckpoint +{ + private final int offset; + private final long inputStreamCheckpoint; + + public LongStreamV1Checkpoint(int offset, long inputStreamCheckpoint) + { + this.offset = offset; + this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); + } + + public LongStreamV1Checkpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + offset = positionsList.nextPosition(); + } + + public int getOffset() + { + return offset; + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("offset", offset) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java new file mode 100644 index 0000000000..352c4d1bc1 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java @@ -0,0 +1,60 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class LongStreamV2Checkpoint + implements LongStreamCheckpoint +{ + private final int offset; + private final long inputStreamCheckpoint; + + public LongStreamV2Checkpoint(int offset, long inputStreamCheckpoint) + { + this.offset = offset; + this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); + } + + public LongStreamV2Checkpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); + offset = positionsList.nextPosition(); + } + + public int getOffset() + { + return offset; + } + + public long getInputStreamCheckpoint() + { + return inputStreamCheckpoint; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("offset", offset) + .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java new file mode 100644 index 0000000000..88ac0515e5 --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java @@ -0,0 +1,53 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.checkpoint; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; + +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; + +public final class RowGroupDictionaryLengthStreamCheckpoint + extends LongStreamV1Checkpoint +{ + private final int rowGroupDictionarySize; + + public RowGroupDictionaryLengthStreamCheckpoint(int rowGroupDictionarySize, int offset, long inputStreamCheckpoint) + { + super(offset, inputStreamCheckpoint); + this.rowGroupDictionarySize = rowGroupDictionarySize; + } + + public RowGroupDictionaryLengthStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) + { + super(compressionKind, positionsList); + rowGroupDictionarySize = positionsList.nextPosition(); + } + + public int getRowGroupDictionarySize() + { + return rowGroupDictionarySize; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("rowGroupDictionarySize", rowGroupDictionarySize) + .add("offset", getOffset()) + 
.add("inputStreamCheckpoint", inputStreamCheckpointToString(getInputStreamCheckpoint())) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java new file mode 100644 index 0000000000..65182d49bd --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java @@ -0,0 +1,117 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class BooleanJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private BooleanStream dataStream; + + public BooleanJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + generator.writeBoolean(dataStream.nextBit()); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + return String.valueOf(dataStream.nextBit()); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { 
+ skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, BooleanStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java new file mode 100644 index 0000000000..d1008528a1 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java @@ -0,0 +1,118 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.ByteStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class ByteJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private ByteStream dataStream; + + public ByteJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + generator.writeNumber(dataStream.next()); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + return String.valueOf(dataStream.next()); + } + + @Override + public void skip(int skipSize) + throws IOException 
+ { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java new file mode 100644 index 0000000000..3243ead772 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java @@ -0,0 +1,123 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class DateJsonReader + implements JsonMapKeyReader +{ + private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1); + + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private LongStream dataStream; + + public DateJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + long millis = dataStream.next() * MILLIS_IN_DAY; + generator.writeNumber(millis); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null 
but data stream is not present"); + + long millis = dataStream.next() * MILLIS_IN_DAY; + return String.valueOf(millis); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java new file mode 100644 index 0000000000..1adf00aeec --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java @@ -0,0 +1,120 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.DoubleStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class DoubleJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private DoubleStream dataStream; + + public DoubleJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + double value = dataStream.next(); + generator.writeNumber(value); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data 
stream is not present"); + + double value = dataStream.next(); + return String.valueOf(value); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, DoubleStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java new file mode 100644 index 0000000000..0b4f668dff --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java @@ -0,0 +1,122 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.FloatStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class FloatJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private FloatStream dataStream; + + public FloatJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // write value as a double to avoid strange rounding errors + double value = dataStream.next(); + generator.writeNumber(value); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + 
verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // write value as a double to avoid strange rounding errors + double value = dataStream.next(); + return String.valueOf(value); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null values + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, FloatStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java new file mode 100644 index 0000000000..6e93f8abb2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java @@ -0,0 +1,23 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import java.io.IOException; + +public interface JsonMapKeyReader + extends JsonReader +{ + String nextValueAsMapKey() + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java new file mode 100644 index 0000000000..f35cbe6d82 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +public interface JsonReader +{ + void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException; + + void openRowGroup(StreamSources dataStreamSources) + throws IOException; + + void readNextValueInto(JsonGenerator generator) + throws IOException; + + void skip(int skipSize) + throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java new file mode 100644 index 0000000000..06019757d2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java @@ -0,0 +1,100 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.joda.time.DateTimeZone; + +public final class JsonReaders +{ + private JsonReaders() + { + } + + public static JsonMapKeyReader createJsonMapKeyReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + switch (streamDescriptor.getStreamType()) { + case BOOLEAN: + return new BooleanJsonReader(streamDescriptor); + case BYTE: + return new ByteJsonReader(streamDescriptor); + case SHORT: + case INT: + case LONG: + return new LongJsonReader(streamDescriptor); + case FLOAT: + return new FloatJsonReader(streamDescriptor); + case DOUBLE: + return new DoubleJsonReader(streamDescriptor); + case BINARY: + return new SliceJsonReader(streamDescriptor, true); + case STRING: + return new SliceJsonReader(streamDescriptor, false); + case TIMESTAMP: + return new TimestampJsonReader(streamDescriptor, hiveStorageTimeZone); + case DATE: + return new DateJsonReader(streamDescriptor); + case STRUCT: + case LIST: + case MAP: + case UNION: + case DECIMAL: + case VARCHAR: + case CHAR: + default: + throw new IllegalArgumentException("Unsupported map key type: " + streamDescriptor.getStreamType()); + } + } + + public static JsonReader createJsonReader( + StreamDescriptor streamDescriptor, + boolean checkForNulls, + DateTimeZone hiveStorageTimeZone) + { + switch (streamDescriptor.getStreamType()) { + case BOOLEAN: + return new BooleanJsonReader(streamDescriptor); + case BYTE: + return new ByteJsonReader(streamDescriptor); + case SHORT: + case INT: + case LONG: + return new LongJsonReader(streamDescriptor); + case FLOAT: + return new FloatJsonReader(streamDescriptor); + case DOUBLE: + return new DoubleJsonReader(streamDescriptor); + case BINARY: + return new SliceJsonReader(streamDescriptor, true); + case STRING: + return new SliceJsonReader(streamDescriptor, false); + case TIMESTAMP: + return new TimestampJsonReader(streamDescriptor, 
hiveStorageTimeZone); + case DATE: + return new DateJsonReader(streamDescriptor); + case STRUCT: + return new StructJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); + case LIST: + return new ListJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); + case MAP: + return new MapJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); + case UNION: + case DECIMAL: + case VARCHAR: + case CHAR: + default: + throw new IllegalArgumentException("Unsupported type: " + streamDescriptor.getStreamType()); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java new file mode 100644 index 0000000000..d6302fb8b5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java @@ -0,0 +1,125 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.LENGTH; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class ListJsonReader + implements JsonReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean checkForNulls; + + private final JsonReader elementReader; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private LongStream lengthStream; + + public ListJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.checkForNulls = checkForNulls; + + elementReader = createJsonReader(streamDescriptor.getNestedStreams().get(0), true, hiveStorageTimeZone); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(lengthStream != null, "Value is 
not null but length stream is not present"); + + long length = lengthStream.next(); + generator.writeStartArray(); + for (int i = 0; i < length; i++) { + elementReader.readNextValueInto(generator); + } + generator.writeEndArray(); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + long elementSkipSize = lengthStream.sum(skipSize); + elementReader.skip(Ints.checkedCast(elementSkipSize)); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + lengthStream = null; + + elementReader.openStripe(dictionaryStreamSources, encoding); + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + if (checkForNulls) { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + } + lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); + + elementReader.openRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java new file mode 100644 index 0000000000..b26fc9ab5b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java @@ -0,0 +1,142 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in 
compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; + +public class LongDictionaryJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + @Nullable + private BooleanStream inDictionaryStream; + @Nullable + private LongStream dataStream; + + @Nonnull + private long[] dictionary = new long[0]; + + public LongDictionaryJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + 
generator.writeNull(); + return; + } + + generator.writeNumber(nextValue()); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + return String.valueOf(nextValue()); + } + + private long nextValue() + throws IOException + { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + long value = dataStream.next(); + if (inDictionaryStream == null || inDictionaryStream.nextBit()) { + value = dictionary[((int) value)]; + } + return value; + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + // skip non-null values + if (inDictionaryStream != null) { + inDictionaryStream.skip(skipSize); + } + if (skipSize > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(skipSize); + } + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + int dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); + if (dictionarySize > 0) { + if (dictionary.length < dictionarySize) { + dictionary = new long[dictionarySize]; + } + + LongStream dictionaryStream = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, LongStream.class).openStream(); + verifyFormat(dictionaryStream != null, "Dictionary is not empty but data stream is not present"); + dictionaryStream.nextLongVector(dictionarySize, dictionary); + } + + presentStream = null; + inDictionaryStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + inDictionaryStream = 
dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java new file mode 100644 index 0000000000..b6edb82db2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java @@ -0,0 +1,112 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class LongDirectJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + @Nullable + private BooleanStream presentStream; + @Nullable + private LongStream dataStream; + + public LongDirectJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + generator.writeNumber(dataStream.next()); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + return String.valueOf(dataStream.next()); + } + + @Override + public void skip(int skipSize) + throws 
IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + // skip non-null values + if (skipSize > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(skipSize); + } + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java new file mode 100644 index 0000000000..4793a11280 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java @@ -0,0 +1,99 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; + +public class LongJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + private final LongDirectJsonReader directReader; + + private final LongDictionaryJsonReader dictionaryReader; + private JsonMapKeyReader currentReader; + + public LongJsonReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + directReader = new LongDirectJsonReader(streamDescriptor); + dictionaryReader = new LongDictionaryJsonReader(streamDescriptor); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + currentReader.readNextValueInto(generator); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + return currentReader.nextValueAsMapKey(); + } + + @Override + public void skip(int skipSize) + throws IOException + { + currentReader.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); + if (kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT) { + currentReader = directReader; + } + else if (kind == DICTIONARY || kind == DICTIONARY_V2) { + 
currentReader = dictionaryReader; + } + else { + throw new IllegalArgumentException("Unsupported encoding " + kind); + } + + currentReader.openStripe(dictionaryStreamSources, encoding); + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + currentReader.openRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java new file mode 100644 index 0000000000..5b6b73b055 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java @@ -0,0 +1,138 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonMapKeyReader; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.LENGTH; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class MapJsonReader + implements JsonReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean checkForNulls; + + private final JsonMapKeyReader keyReader; + private final JsonReader valueReader; + + @Nullable + private BooleanStream presentStream; + @Nullable + private LongStream lengthStream; + + public MapJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.checkForNulls = checkForNulls; + + keyReader = createJsonMapKeyReader(streamDescriptor.getNestedStreams().get(0), hiveStorageTimeZone); + valueReader = createJsonReader(streamDescriptor.getNestedStreams().get(1), true, hiveStorageTimeZone); + } + + 
@Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + long length = lengthStream.next(); + generator.writeStartObject(); + for (int i = 0; i < length; i++) { + String name = keyReader.nextValueAsMapKey(); + if (name == null) { + valueReader.skip(1); + } + else { + generator.writeFieldName(name); + valueReader.readNextValueInto(generator); + } + } + generator.writeEndObject(); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + // skip non-null values + long elementSkipSize = lengthStream.sum(skipSize); + keyReader.skip(Ints.checkedCast(elementSkipSize)); + valueReader.skip(Ints.checkedCast(elementSkipSize)); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + lengthStream = null; + + keyReader.openStripe(dictionaryStreamSources, encoding); + valueReader.openStripe(dictionaryStreamSources, encoding); + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + if (checkForNulls) { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + } + lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); + + keyReader.openRowGroup(dataStreamSources); + valueReader.openRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java new file mode 100644 index 0000000000..bf7cb6fc13 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java @@ -0,0 +1,269 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.io.BaseEncoding; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.*; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static java.nio.charset.StandardCharsets.UTF_8; + +public class SliceDictionaryJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean writeBinary; + + @Nonnull + private DictionaryEntry[] dictionary = new DictionaryEntry[0]; + + @Nonnull + private int[] dictionaryLength = new int[0]; + + @Nonnull + private DictionaryEntry[] rowGroupDictionary = new DictionaryEntry[0]; + + @Nonnull + private int[] rowGroupDictionaryLength = new int[0]; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private BooleanStream inDictionaryStream; + + @Nullable + private LongStream dataStream; + + public SliceDictionaryJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.writeBinary = writeBinary; + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + DictionaryEntry value = getNextValue(); + + byte[] data = value.getData(); + int offset = 
value.getOffset(); + int length = value.length(); + if (writeBinary) { + generator.writeBinary(data, offset, length); + } + else { + generator.writeUTF8String(data, offset, length); + } + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + DictionaryEntry value = getNextValue(); + + byte[] data = value.getData(); + int offset = value.getOffset(); + int length = value.length(); + if (writeBinary) { + return BaseEncoding.base64().encode(data, offset, length); + } + else { + return new String(data, offset, length, UTF_8); + } + } + + private DictionaryEntry getNextValue() + throws IOException + { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + int dictionaryIndex = Ints.checkedCast(dataStream.next()); + + DictionaryEntry value; + if (inDictionaryStream == null || inDictionaryStream.nextBit()) { + value = dictionary[dictionaryIndex]; + } + else { + value = rowGroupDictionary[dictionaryIndex]; + } + return value; + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + + // skip non-null length + if (inDictionaryStream != null) { + inDictionaryStream.skip(skipSize); + } + dataStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + int dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); + if (dictionarySize > 0) { + // resize the dictionary array if necessary + if (dictionary.length < dictionarySize) { + dictionary = new DictionaryEntry[dictionarySize]; + dictionaryLength = new int[dictionarySize]; + } + + LongStream lengthStream = 
dictionaryStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); + verifyFormat(lengthStream != null, "Dictionary is not empty but length stream is not present"); + lengthStream.nextIntVector(dictionarySize, dictionaryLength); + + ByteArrayStream dictionaryDataStream = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, ByteArrayStream.class).openStream(); + readDictionary(dictionaryDataStream, dictionarySize, dictionaryLength, dictionary); + } + + presentStream = null; + dataStream = null; + inDictionaryStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + RowGroupDictionaryLengthStream lengthStream = dataStreamSources.getStreamSource( + streamDescriptor, + ROW_GROUP_DICTIONARY_LENGTH, + RowGroupDictionaryLengthStream.class).openStream(); + + if (lengthStream == null) { + inDictionaryStream = null; + } + else { + inDictionaryStream = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class).openStream(); + + int dictionaryEntryCount = lengthStream.getEntryCount(); + + // resize the dictionary array if necessary + if (rowGroupDictionary.length < dictionaryEntryCount) { + rowGroupDictionary = new DictionaryEntry[dictionaryEntryCount]; + rowGroupDictionaryLength = new int[dictionaryEntryCount]; + } + + // read the lengths + lengthStream.nextIntVector(dictionaryEntryCount, rowGroupDictionaryLength); + + ByteArrayStream dictionaryDataStream = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY, ByteArrayStream.class).openStream(); + readDictionary(dictionaryDataStream, dictionaryEntryCount, rowGroupDictionaryLength, rowGroupDictionary); + } + + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + } + + private static void 
readDictionary(ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, DictionaryEntry[] dictionary) + throws IOException + { + // sum lengths + int totalLength = 0; + for (int i = 0; i < dictionarySize; i++) { + totalLength += dictionaryLength[i]; + } + + // read dictionary data + byte[] dictionaryData = new byte[0]; + if (totalLength > 0) { + verifyFormat(dictionaryDataStream != null, "Dictionary length is not zero but dictionary data stream is not present"); + dictionaryData = dictionaryDataStream.next(totalLength); + } + + // build dictionary slices + int offset = 0; + for (int i = 0; i < dictionarySize; i++) { + int length = dictionaryLength[i]; + dictionary[i] = new DictionaryEntry(dictionaryData, offset, length); + offset += length; + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } + + private static class DictionaryEntry + { + private final byte[] dictionary; + private final int offset; + private final int length; + + public DictionaryEntry(byte[] dictionary, int offset, int length) + { + this.dictionary = dictionary; + this.offset = offset; + this.length = length; + } + + public int length() + { + return length; + } + + public byte[] getData() + { + return dictionary; + } + + public int getOffset() + { + return offset; + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java new file mode 100644 index 0000000000..6f6630c59e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java @@ -0,0 +1,168 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import com.google.common.io.BaseEncoding; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.ByteArrayStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static java.nio.charset.StandardCharsets.UTF_8; + +public class SliceDirectJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean writeBinary; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private LongStream lengthStream; + + @Nullable + private ByteArrayStream dataStream; + + @Nonnull + private byte[] data = new byte[1024]; + + public SliceDirectJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) + { + 
this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.writeBinary = writeBinary; + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + int length = bufferNextValue(); + + if (writeBinary) { + generator.writeBinary(data, 0, length); + } + else { + generator.writeUTF8String(data, 0, length); + } + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + int length = bufferNextValue(); + + if (writeBinary) { + return BaseEncoding.base64().encode(data, 0, length); + } + else { + return new String(data, 0, length, UTF_8); + } + } + + private int bufferNextValue() + throws IOException + { + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + int length = Ints.checkedCast(lengthStream.next()); + if (data.length < length) { + data = new byte[length]; + } + + if (length > 0) { + verifyFormat(dataStream != null, "Length is not zero but data stream is not present"); + dataStream.next(length, data); + } + return length; + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + + // skip non-null length + long dataSkipSize = lengthStream.sum(skipSize); + + if (dataSkipSize == 0) { + return; + } + + verifyFormat(dataStream != null, "Length is not zero but data stream is not present"); + + // skip data bytes + dataStream.skip(Ints.checkedCast(dataSkipSize)); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + lengthStream 
= null; + dataStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); + dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteArrayStream.class).openStream(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java new file mode 100644 index 0000000000..68892ca244 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java @@ -0,0 +1,98 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; + +public class SliceJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + private final SliceDirectJsonReader directReader; + private final SliceDictionaryJsonReader dictionaryReader; + private JsonMapKeyReader currentReader; + + public SliceJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + directReader = new SliceDirectJsonReader(streamDescriptor, writeBinary); + dictionaryReader = new SliceDictionaryJsonReader(streamDescriptor, writeBinary); + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + currentReader.readNextValueInto(generator); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + return currentReader.nextValueAsMapKey(); + } + + @Override + public void skip(int skipSize) + throws IOException + { + currentReader.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, + List encoding) + throws IOException + { + ColumnEncodingKind columnEncodingKind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); + if (columnEncodingKind == DIRECT || columnEncodingKind == DIRECT_V2 || columnEncodingKind == 
ColumnEncodingKind.DWRF_DIRECT) { + currentReader = directReader; + } + else if (columnEncodingKind == DICTIONARY || columnEncodingKind == DICTIONARY_V2) { + currentReader = dictionaryReader; + } + else { + throw new IllegalArgumentException("Unsupported encoding " + columnEncodingKind); + } + + currentReader.openStripe(dictionaryStreamSources, encoding); + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + currentReader.openRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java new file mode 100644 index 0000000000..600b7b778d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java @@ -0,0 +1,117 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; + +public class StructJsonReader + implements JsonReader +{ + private final StreamDescriptor streamDescriptor; + private final boolean checkForNulls; + private final JsonReader[] structFields; + + @Nullable + private BooleanStream presentStream; + + public StructJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.checkForNulls = checkForNulls; + + List nestedStreams = streamDescriptor.getNestedStreams(); + this.structFields = new JsonReader[nestedStreams.size()]; + for (int i = 0; i < nestedStreams.size(); i++) { + StreamDescriptor nestedStream = nestedStreams.get(i); + this.structFields[i] = createJsonReader(nestedStream, true, hiveStorageTimeZone); + } + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + generator.writeStartArray(); + for (JsonReader structField : structFields) { + structField.readNextValueInto(generator); + } + generator.writeEndArray(); + } + + @Override 
+ public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + // skip non-null values + for (JsonReader structField : structFields) { + structField.skip(skipSize); + } + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + + for (JsonReader structField : structFields) { + structField.openStripe(dictionaryStreamSources, encoding); + } + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + if (checkForNulls) { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + } + + for (JsonReader structField : structFields) { + structField.openRowGroup(dataStreamSources); + } + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java new file mode 100644 index 0000000000..bfebf78658 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java @@ -0,0 +1,134 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.json; + +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.reader.TimestampStreamReader.decodeTimestamp; + +public class TimestampJsonReader + implements JsonMapKeyReader +{ + private final StreamDescriptor streamDescriptor; + + private final long baseTimestampInSeconds; + + @Nullable + private BooleanStream presentStream; + + @Nullable + private LongStream secondsStream; + + @Nullable + private LongStream nanosStream; + + public TimestampJsonReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / 1000; + } + + @Override + public void readNextValueInto(JsonGenerator generator) + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + generator.writeNull(); + return; + } + + 
verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + long timestamp = decodeTimestamp(secondsStream.next(), nanosStream.next(), baseTimestampInSeconds); + generator.writeNumber(timestamp); + } + + @Override + public String nextValueAsMapKey() + throws IOException + { + if (presentStream != null && !presentStream.nextBit()) { + return null; + } + + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + long timestamp = decodeTimestamp(secondsStream.next(), nanosStream.next(), baseTimestampInSeconds); + return String.valueOf(timestamp); + } + + @Override + public void skip(int skipSize) + throws IOException + { + // skip nulls + if (presentStream != null) { + skipSize = presentStream.countBitsSet(skipSize); + } + + if (skipSize == 0) { + return; + } + + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + // skip non-null values + secondsStream.skip(skipSize); + nanosStream.skip(skipSize); + } + + @Override + public void openStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStream = null; + secondsStream = null; + nanosStream = null; + } + + @Override + public void openRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); + secondsStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); + nanosStream = dataStreamSources.getStreamSource(streamDescriptor, SECONDARY, LongStream.class).openStream(); + } + + @Override + public String toString() + { + return 
toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java new file mode 100644 index 0000000000..20ae97058e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java @@ -0,0 +1,367 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.facebook.hive.orc.OrcProto; +import com.facebook.hive.orc.OrcProto.ColumnEncoding.Kind; +import com.google.common.base.Function; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.common.primitives.Ints; +import com.google.protobuf.CodedInputStream; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; + +public class DwrfMetadataReader + implements MetadataReader +{ + @Override + public PostScript readPostScript(byte[] data, int offset, int length) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(data, offset, length); + OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(input); + + return new PostScript( + ImmutableList.of(), + postScript.getFooterLength(), + 0, + toCompression(postScript.getCompression()), + postScript.getCompressionBlockSize()); + } + + @Override + public Metadata readMetadata(InputStream inputStream) + throws IOException + { + return new Metadata(ImmutableList.of()); + } + + @Override + public Footer readFooter(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.Footer footer = OrcProto.Footer.parseFrom(input); + return new Footer( + footer.getNumberOfRows(), + footer.getRowIndexStride(), + toStripeInformation(footer.getStripesList()), + toType(footer.getTypesList()), + 
toColumnStatistics(footer.getStatisticsList(), false)); + } + + private static List toStripeInformation(List types) + { + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { + @Override + public StripeInformation apply(OrcProto.StripeInformation type) + { + return toStripeInformation(type); + } + })); + } + + private static StripeInformation toStripeInformation(OrcProto.StripeInformation stripeInformation) + { + return new StripeInformation( + Ints.checkedCast(stripeInformation.getNumberOfRows()), + stripeInformation.getOffset(), + stripeInformation.getIndexLength(), + stripeInformation.getDataLength(), + stripeInformation.getFooterLength()); + } + + @Override + public StripeFooter readStripeFooter(List types, InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.StripeFooter stripeFooter = OrcProto.StripeFooter.parseFrom(input); + return new StripeFooter(toStream(stripeFooter.getStreamsList()), toColumnEncoding(types, stripeFooter.getColumnsList())); + } + + private static Stream toStream(OrcProto.Stream stream) + { + return new Stream(stream.getColumn(), toStreamKind(stream.getKind()), Ints.checkedCast(stream.getLength()), stream.getUseVInts()); + } + + private static List toStream(List streams) + { + return ImmutableList.copyOf(Iterables.transform(streams, new Function() + { + @Override + public Stream apply(OrcProto.Stream stream) + { + return toStream(stream); + } + })); + } + + private static ColumnEncoding toColumnEncoding(OrcTypeKind type, OrcProto.ColumnEncoding columnEncoding) + { + return new ColumnEncoding(toColumnEncodingKind(type, columnEncoding.getKind()), columnEncoding.getDictionarySize()); + } + + private static List toColumnEncoding(List types, List columnEncodings) + { + checkArgument(types.size() == columnEncodings.size()); + + ImmutableList.Builder encodings = ImmutableList.builder(); + for (int i = 0; i < types.size(); i++) { + OrcType type = 
types.get(i); + encodings.add(toColumnEncoding(type.getOrcTypeKind(), columnEncodings.get(i))); + } + return encodings.build(); + } + + @Override + public List readRowIndexes(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); + return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() + { + @Override + public RowGroupIndex apply(OrcProto.RowIndexEntry rowIndexEntry) + { + return toRowGroupIndex(rowIndexEntry); + } + })); + } + + private static RowGroupIndex toRowGroupIndex(OrcProto.RowIndexEntry rowIndexEntry) + { + List positionsList = rowIndexEntry.getPositionsList(); + ImmutableList.Builder positions = ImmutableList.builder(); + for (int index = 0; index < positionsList.size(); index++) { + long longPosition = positionsList.get(index); + int intPosition = (int) longPosition; + + checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index); + + positions.add(intPosition); + } + return new RowGroupIndex(positions.build(), toColumnStatistics(rowIndexEntry.getStatistics(), true)); + } + + private static List toColumnStatistics(List columnStatistics, final boolean isRowGroup) + { + if (columnStatistics == null) { + return ImmutableList.of(); + } + return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() + { + @Override + public ColumnStatistics apply(OrcProto.ColumnStatistics columnStatistics) + { + return toColumnStatistics(columnStatistics, isRowGroup); + } + })); + } + + private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) + { + return new ColumnStatistics( + statistics.getNumberOfValues(), + toBooleanStatistics(statistics.getBucketStatistics()), + toIntegerStatistics(statistics.getIntStatistics()), + toDoubleStatistics(statistics.getDoubleStatistics()), + 
toStringStatistics(statistics.getStringStatistics(), isRowGroup), + null); + } + + private static BooleanStatistics toBooleanStatistics(OrcProto.BucketStatistics bucketStatistics) + { + if (bucketStatistics.getCountCount() == 0) { + return null; + } + + return new BooleanStatistics(bucketStatistics.getCount(0)); + } + + private static IntegerStatistics toIntegerStatistics(OrcProto.IntegerStatistics integerStatistics) + { + if (!integerStatistics.hasMinimum() && !integerStatistics.hasMaximum()) { + return null; + } + + return new IntegerStatistics( + integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null, + integerStatistics.hasMaximum() ? integerStatistics.getMaximum() : null); + } + + private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics doubleStatistics) + { + if (!doubleStatistics.hasMinimum() && !doubleStatistics.hasMaximum()) { + return null; + } + + // TODO remove this when double statistics are changed to correctly deal with NaNs + // if either min or max is NaN, ignore the stat + if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || + (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum()))) { + return null; + } + + return new DoubleStatistics( + doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null, + doubleStatistics.hasMaximum() ? 
doubleStatistics.getMaximum() : null); + } + + private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) + { + // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 + if (!isRowGroup) { + return null; + } + + if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { + return null; + } + + // temporarily disable string statistics until we figure out the implications of how UTF-16 + // strings are compared when they contain surrogate pairs and replacement characters + if (true) { + return null; + } + + return new StringStatistics( + stringStatistics.hasMinimum() ? stringStatistics.getMinimum() : null, + stringStatistics.hasMaximum() ? stringStatistics.getMaximum() : null); + } + + private static OrcType toType(OrcProto.Type type) + { + return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList()); + } + + private static List toType(List types) + { + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { + @Override + public OrcType apply(OrcProto.Type type) + { + return toType(type); + } + })); + } + + private static OrcTypeKind toTypeKind(OrcProto.Type.Kind kind) + { + switch (kind) { + case BOOLEAN: + return OrcTypeKind.BOOLEAN; + case BYTE: + return OrcTypeKind.BYTE; + case SHORT: + return OrcTypeKind.SHORT; + case INT: + return OrcTypeKind.INT; + case LONG: + return OrcTypeKind.LONG; + case FLOAT: + return OrcTypeKind.FLOAT; + case DOUBLE: + return OrcTypeKind.DOUBLE; + case STRING: + return OrcTypeKind.STRING; + case BINARY: + return OrcTypeKind.BINARY; + case TIMESTAMP: + return OrcTypeKind.TIMESTAMP; + case LIST: + return OrcTypeKind.LIST; + case MAP: + return OrcTypeKind.MAP; + case STRUCT: + return OrcTypeKind.STRUCT; + case UNION: + return OrcTypeKind.UNION; + default: + throw new IllegalArgumentException(kind + " data type not implemented yet"); + } + } + + private static 
StreamKind toStreamKind(OrcProto.Stream.Kind kind) + { + switch (kind) { + case PRESENT: + return StreamKind.PRESENT; + case DATA: + return StreamKind.DATA; + case LENGTH: + return StreamKind.LENGTH; + case DICTIONARY_DATA: + return StreamKind.DICTIONARY_DATA; + case DICTIONARY_COUNT: + return StreamKind.DICTIONARY_COUNT; + case NANO_DATA: + return StreamKind.SECONDARY; + case ROW_INDEX: + return StreamKind.ROW_INDEX; + case IN_DICTIONARY: + return StreamKind.IN_DICTIONARY; + case STRIDE_DICTIONARY: + return StreamKind.ROW_GROUP_DICTIONARY; + case STRIDE_DICTIONARY_LENGTH: + return StreamKind.ROW_GROUP_DICTIONARY_LENGTH; + default: + throw new IllegalArgumentException(kind + " stream type not implemented yet"); + } + } + + private static ColumnEncodingKind toColumnEncodingKind(OrcTypeKind type, Kind kind) + { + switch (kind) { + case DIRECT: + if (type == OrcTypeKind.SHORT || type == OrcTypeKind.INT || type == OrcTypeKind.LONG) { + return ColumnEncodingKind.DWRF_DIRECT; + } + else { + return ColumnEncodingKind.DIRECT; + } + case DICTIONARY: + return ColumnEncodingKind.DICTIONARY; + default: + throw new IllegalArgumentException(kind + " stream encoding not implemented yet"); + } + } + + private static CompressionKind toCompression(OrcProto.CompressionKind compression) + { + switch (compression) { + case NONE: + return UNCOMPRESSED; + case ZLIB: + return ZLIB; + case SNAPPY: + return SNAPPY; + default: + throw new IllegalArgumentException(compression + " compression not implemented yet"); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java new file mode 100644 index 0000000000..38bae8b8f2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java @@ -0,0 +1,402 @@ +/* + * Licensed under 
the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +import com.facebook.presto.hive.shaded.com.google.protobuf.CodedInputStream; +import com.google.common.base.Function; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; +import org.apache.hadoop.hive.ql.io.orc.OrcProto; +import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; + +import static com.google.common.base.Preconditions.checkState; +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; + +public class OrcMetadataReader + implements MetadataReader +{ + @Override + public PostScript readPostScript(byte[] data, int offset, int length) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(data, offset, length); + OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(input); + + return new PostScript( + postScript.getVersionList(), + postScript.getFooterLength(), + postScript.getMetadataLength(), + toCompression(postScript.getCompression()), + postScript.getCompressionBlockSize()); + } + + 
@Override + public Metadata readMetadata(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.Metadata metadata = OrcProto.Metadata.parseFrom(input); + return new Metadata(toStripeStatistics(metadata.getStripeStatsList())); + } + + private static List toStripeStatistics(List types) + { + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { + @Override + public StripeStatistics apply(OrcProto.StripeStatistics type) + { + return toStripeStatistics(type); + } + })); + } + + private static StripeStatistics toStripeStatistics(OrcProto.StripeStatistics stripeStatistics) + { + return new StripeStatistics(toColumnStatistics(stripeStatistics.getColStatsList(), false)); + } + + @Override + public Footer readFooter(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.Footer footer = OrcProto.Footer.parseFrom(input); + return new Footer( + footer.getNumberOfRows(), + footer.getRowIndexStride(), + toStripeInformation(footer.getStripesList()), + toType(footer.getTypesList()), + toColumnStatistics(footer.getStatisticsList(), false)); + } + + private static List toStripeInformation(List types) + { + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { + @Override + public StripeInformation apply(OrcProto.StripeInformation type) + { + return toStripeInformation(type); + } + })); + } + + private static StripeInformation toStripeInformation(OrcProto.StripeInformation stripeInformation) + { + return new StripeInformation( + Ints.checkedCast(stripeInformation.getNumberOfRows()), + stripeInformation.getOffset(), + stripeInformation.getIndexLength(), + stripeInformation.getDataLength(), + stripeInformation.getFooterLength()); + } + + @Override + public StripeFooter readStripeFooter(List types, InputStream inputStream) + throws IOException + { + CodedInputStream input = 
CodedInputStream.newInstance(inputStream); + OrcProto.StripeFooter stripeFooter = OrcProto.StripeFooter.parseFrom(input); + return new StripeFooter(toStream(stripeFooter.getStreamsList()), toColumnEncoding(stripeFooter.getColumnsList())); + } + + private static Stream toStream(OrcProto.Stream stream) + { + return new Stream(stream.getColumn(), toStreamKind(stream.getKind()), Ints.checkedCast(stream.getLength()), true); + } + + private static List toStream(List streams) + { + return ImmutableList.copyOf(Iterables.transform(streams, new Function() + { + @Override + public Stream apply(OrcProto.Stream stream) + { + return toStream(stream); + } + })); + } + + private static ColumnEncoding toColumnEncoding(OrcProto.ColumnEncoding columnEncoding) + { + return new ColumnEncoding(toColumnEncodingKind(columnEncoding.getKind()), columnEncoding.getDictionarySize()); + } + + private static List toColumnEncoding(List columnEncodings) + { + return ImmutableList.copyOf(Iterables.transform(columnEncodings, new Function() + { + @Override + public ColumnEncoding apply(OrcProto.ColumnEncoding columnEncoding) + { + return toColumnEncoding(columnEncoding); + } + })); + } + + @Override + public List readRowIndexes(InputStream inputStream) + throws IOException + { + CodedInputStream input = CodedInputStream.newInstance(inputStream); + OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); + return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() + { + @Override + public RowGroupIndex apply(RowIndexEntry rowIndexEntry) + { + return toRowGroupIndex(rowIndexEntry); + } + })); + } + + private static RowGroupIndex toRowGroupIndex(RowIndexEntry rowIndexEntry) + { + List positionsList = rowIndexEntry.getPositionsList(); + ImmutableList.Builder positions = ImmutableList.builder(); + for (int index = 0; index < positionsList.size(); index++) { + long longPosition = positionsList.get(index); + int intPosition = (int) longPosition; + + 
checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index); + + positions.add(intPosition); + } + return new RowGroupIndex(positions.build(), toColumnStatistics(rowIndexEntry.getStatistics(), true)); + } + + private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) + { + return new ColumnStatistics( + statistics.getNumberOfValues(), + toBooleanStatistics(statistics.getBucketStatistics()), + toIntegerStatistics(statistics.getIntStatistics()), + toDoubleStatistics(statistics.getDoubleStatistics()), + toStringStatistics(statistics.getStringStatistics(), isRowGroup), + toDateStatistics(statistics.getDateStatistics(), isRowGroup)); + } + + private static List toColumnStatistics(List columnStatistics, final boolean isRowGroup) + { + if (columnStatistics == null) { + return ImmutableList.of(); + } + return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() + { + @Override + public ColumnStatistics apply(OrcProto.ColumnStatistics columnStatistics) + { + return toColumnStatistics(columnStatistics, isRowGroup); + } + })); + } + + private static BooleanStatistics toBooleanStatistics(OrcProto.BucketStatistics bucketStatistics) + { + if (bucketStatistics.getCountCount() == 0) { + return null; + } + + return new BooleanStatistics(bucketStatistics.getCount(0)); + } + + private static IntegerStatistics toIntegerStatistics(OrcProto.IntegerStatistics integerStatistics) + { + if (!integerStatistics.hasMinimum() && !integerStatistics.hasMaximum()) { + return null; + } + + return new IntegerStatistics( + integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null, + integerStatistics.hasMaximum() ? 
integerStatistics.getMaximum() : null); + } + + private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics doubleStatistics) + { + if (!doubleStatistics.hasMinimum() && !doubleStatistics.hasMaximum()) { + return null; + } + + // TODO remove this when double statistics are changed to correctly deal with NaNs + // if either min or max is NaN, ignore the stat + if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || + (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum()))) { + return null; + } + + return new DoubleStatistics( + doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null, + doubleStatistics.hasMaximum() ? doubleStatistics.getMaximum() : null); + } + + private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) + { + // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 + if (!isRowGroup) { + return null; + } + + if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { + return null; + } + + return new StringStatistics( + stringStatistics.hasMinimum() ? stringStatistics.getMinimum() : null, + stringStatistics.hasMaximum() ? stringStatistics.getMaximum() : null); + } + + private static DateStatistics toDateStatistics(OrcProto.DateStatistics dateStatistics, boolean isRowGroup) + { + // TODO remove this when date statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 + if (!isRowGroup) { + return null; + } + + if (!dateStatistics.hasMinimum() && !dateStatistics.hasMaximum()) { + return null; + } + + // temporarily disable string statistics until we figure out the implications of how UTF-16 + // strings are compared when they contain surrogate pairs and replacement characters + if (true) { + return null; + } + + return new DateStatistics( + dateStatistics.hasMinimum() ? 
dateStatistics.getMinimum() : null, + dateStatistics.hasMaximum() ? dateStatistics.getMaximum() : null); + } + + private static OrcType toType(OrcProto.Type type) + { + return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList()); + } + + private static List toType(List types) + { + return ImmutableList.copyOf(Iterables.transform(types, new Function() + { + @Override + public OrcType apply(OrcProto.Type type) + { + return toType(type); + } + })); + } + + private static OrcTypeKind toTypeKind(OrcProto.Type.Kind typeKind) + { + switch (typeKind) { + case BOOLEAN: + return OrcTypeKind.BOOLEAN; + case BYTE: + return OrcTypeKind.BYTE; + case SHORT: + return OrcTypeKind.SHORT; + case INT: + return OrcTypeKind.INT; + case LONG: + return OrcTypeKind.LONG; + case FLOAT: + return OrcTypeKind.FLOAT; + case DOUBLE: + return OrcTypeKind.DOUBLE; + case STRING: + return OrcTypeKind.STRING; + case BINARY: + return OrcTypeKind.BINARY; + case TIMESTAMP: + return OrcTypeKind.TIMESTAMP; + case LIST: + return OrcTypeKind.LIST; + case MAP: + return OrcTypeKind.MAP; + case STRUCT: + return OrcTypeKind.STRUCT; + case UNION: + return OrcTypeKind.UNION; + case DECIMAL: + return OrcTypeKind.DECIMAL; + case DATE: + return OrcTypeKind.DATE; + case VARCHAR: + return OrcTypeKind.VARCHAR; + case CHAR: + return OrcTypeKind.CHAR; + default: + throw new IllegalStateException(typeKind + " stream type not implemented yet"); + } + } + + private static StreamKind toStreamKind(OrcProto.Stream.Kind streamKind) + { + switch (streamKind) { + case PRESENT: + return StreamKind.PRESENT; + case DATA: + return StreamKind.DATA; + case LENGTH: + return StreamKind.LENGTH; + case DICTIONARY_DATA: + return StreamKind.DICTIONARY_DATA; + case DICTIONARY_COUNT: + return StreamKind.DICTIONARY_COUNT; + case SECONDARY: + return StreamKind.SECONDARY; + case ROW_INDEX: + return StreamKind.ROW_INDEX; + default: + throw new IllegalStateException(streamKind + " stream type not implemented 
yet"); + } + } + + private static ColumnEncodingKind toColumnEncodingKind(OrcProto.ColumnEncoding.Kind columnEncodingKind) + { + switch (columnEncodingKind) { + case DIRECT: + return ColumnEncodingKind.DIRECT; + case DIRECT_V2: + return ColumnEncodingKind.DIRECT_V2; + case DICTIONARY: + return ColumnEncodingKind.DICTIONARY; + case DICTIONARY_V2: + return ColumnEncodingKind.DICTIONARY_V2; + default: + throw new IllegalStateException(columnEncodingKind + " stream encoding not implemented yet"); + } + } + + private static CompressionKind toCompression(OrcProto.CompressionKind compression) + { + switch (compression) { + case NONE: + return UNCOMPRESSED; + case ZLIB: + return ZLIB; + case SNAPPY: + return SNAPPY; + default: + throw new IllegalStateException(compression + " compression not implemented yet"); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java new file mode 100644 index 0000000000..17cb8ba289 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java @@ -0,0 +1,39 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.metadata; + +public class StringStatistics + implements RangeStatistics +{ + private final String minimum; + private final String maximum; + + public StringStatistics(String minimum, String maximum) + { + this.minimum = minimum; + this.maximum = maximum; + } + + @Override + public String getMin() + { + return minimum; + } + + @Override + public String getMax() + { + return maximum; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java new file mode 100644 index 0000000000..cb38b2ed6e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java @@ -0,0 +1,153 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.BooleanVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class BooleanStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream dataStream; + + private boolean rowGroupOpen; + + public BooleanStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if 
(!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(readOffset); + } + } + + BooleanVector booleanVector = (BooleanVector) vector; + if (presentStream == null) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + Arrays.fill(booleanVector.isNull, false); + dataStream.getSetBits(nextBatchSize, booleanVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, booleanVector.isNull); + if (nullValues != nextBatchSize) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.getSetBits(nextBatchSize, booleanVector.vector, booleanVector.isNull); + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(BooleanStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, BooleanStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + 
dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java new file mode 100644 index 0000000000..3688d2fce2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java @@ -0,0 +1,155 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.LongVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.ByteStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class ByteStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(ByteStream.class); + @Nullable + private ByteStream dataStream; + + private boolean rowGroupOpen; + + public ByteStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object 
vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(readOffset); + } + } + + LongVector byteVector = (LongVector) vector; + if (presentStream == null) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + Arrays.fill(byteVector.isNull, false); + dataStream.nextVector(nextBatchSize, byteVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, byteVector.isNull); + if (nullValues != nextBatchSize) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.nextVector(nextBatchSize, byteVector.vector, byteVector.isNull); + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(ByteStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; 
+ dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java new file mode 100644 index 0000000000..afca11996d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java @@ -0,0 +1,155 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.DoubleVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.DoubleStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class DoubleStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(DoubleStream.class); + @Nullable + private DoubleStream dataStream; + + private boolean rowGroupOpen; + + public DoubleStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void 
readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(readOffset); + } + } + + DoubleVector doubleVector = (DoubleVector) vector; + if (presentStream == null) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + Arrays.fill(doubleVector.isNull, false); + dataStream.nextVector(nextBatchSize, doubleVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, doubleVector.isNull); + if (nullValues != nextBatchSize) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.nextVector(nextBatchSize, doubleVector.vector, doubleVector.isNull); + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(DoubleStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, DoubleStream.class); + + readOffset = 0; + 
nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java new file mode 100644 index 0000000000..8d75390337 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java @@ -0,0 +1,156 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.DoubleVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.FloatStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class FloatStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(FloatStream.class); + @Nullable + private FloatStream dataStream; + + private boolean rowGroupOpen; + + public FloatStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void 
readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(readOffset); + } + } + + // we could add a float vector but Presto currently doesn't support floats + DoubleVector floatVector = (DoubleVector) vector; + if (presentStream == null) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + Arrays.fill(floatVector.isNull, false); + dataStream.nextVector(nextBatchSize, floatVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, floatVector.isNull); + if (nullValues != nextBatchSize) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.nextVector(nextBatchSize, floatVector.vector, floatVector.isNull); + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(FloatStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + dataStreamSource = 
dataStreamSources.getStreamSource(streamDescriptor, DATA, FloatStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java new file mode 100644 index 0000000000..8048e61335 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java @@ -0,0 +1,180 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.core.JsonGenerator; +import org.apache.tajo.storage.thirdparty.orc.SliceVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.json.JsonReader; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import io.airlift.slice.DynamicSliceOutput; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class JsonStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + private final JsonReader jsonReader; + + private boolean stripeOpen; + private boolean rowGroupOpen; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; + + private int readOffset; + private int nextBatchSize; + + @Nullable + private StreamSources dictionaryStreamSources; + @Nullable + private StreamSources dataStreamSources; + + private List encoding; + + public 
JsonStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.jsonReader = createJsonReader(streamDescriptor, false, hiveStorageTimeZone); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the length reader + readOffset = presentStream.countBitsSet(readOffset); + } + + jsonReader.skip(readOffset); + } + + SliceVector sliceVector = (SliceVector) vector; + if (presentStream != null) { + presentStream.getUnsetBits(nextBatchSize, isNullVector); + } + + DynamicSliceOutput out = new DynamicSliceOutput(1024); + for (int i = 0; i < nextBatchSize; i++) { + if (!isNullVector[i]) { + out.reset(); + JsonGenerator generator = new JsonFactory().createGenerator(out); + jsonReader.readNextValueInto(generator); + sliceVector.vector[i] = out.copySlice(); + } + else { + sliceVector.vector[i] = null; + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + + if (!stripeOpen) { + jsonReader.openStripe(dictionaryStreamSources, encoding); + } + + jsonReader.openRowGroup(dataStreamSources); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + this.dictionaryStreamSources = dictionaryStreamSources; + this.dataStreamSources = null; + this.encoding = encoding; + + presentStreamSource = missingStreamSource(BooleanStream.class); + + stripeOpen = false; + rowGroupOpen = false; + + readOffset = 0; + nextBatchSize = 0; + + 
Arrays.fill(isNullVector, false); + + presentStream = null; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + this.dataStreamSources = dataStreamSources; + + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + + rowGroupOpen = false; + + readOffset = 0; + nextBatchSize = 0; + + Arrays.fill(isNullVector, false); + + presentStream = null; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java new file mode 100644 index 0000000000..bd847f6efd --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java @@ -0,0 +1,210 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.LongVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class LongDictionaryStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dictionaryDataStreamSource = missingStreamSource(LongStream.class); + private int dictionarySize; + @Nonnull + private long[] dictionary = new long[0]; + + @Nonnull + private StreamSource inDictionaryStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream inDictionaryStream; + private final boolean[] inDictionary = new boolean[Vector.MAX_VECTOR_LENGTH]; + + @Nonnull + private StreamSource dataStreamSource; + @Nullable + private LongStream 
dataStream; + + private boolean dictionaryOpen; + private boolean rowGroupOpen; + + public LongDictionaryStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the length reader + readOffset = presentStream.countBitsSet(readOffset); + } + + if (inDictionaryStream != null) { + inDictionaryStream.skip(readOffset); + } + + if (readOffset > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(readOffset); + } + } + + LongVector longVector = (LongVector) vector; + + if (presentStream == null) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + Arrays.fill(longVector.isNull, false); + dataStream.nextLongVector(nextBatchSize, longVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); + if (nullValues != nextBatchSize) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); + } + } + + if (inDictionaryStream == null) { + Arrays.fill(inDictionary, true); + } + else { + inDictionaryStream.getSetBits(nextBatchSize, inDictionary, longVector.isNull); + } + + for (int i = 0; i < nextBatchSize; i++) { + if (!longVector.isNull[i]) { + if (inDictionary[i]) { + longVector.vector[i] = dictionary[((int) longVector.vector[i])]; + } + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + // 
read the dictionary + if (!dictionaryOpen && dictionarySize > 0) { + if (dictionary.length < dictionarySize) { + dictionary = new long[dictionarySize]; + } + + LongStream dictionaryStream = dictionaryDataStreamSource.openStream(); + verifyFormat(dictionaryStream != null, "Dictionary is not empty but data stream is not present"); + dictionaryStream.nextLongVector(dictionarySize, dictionary); + } + dictionaryOpen = true; + + presentStream = presentStreamSource.openStream(); + inDictionaryStream = inDictionaryStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + dictionaryDataStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, LongStream.class); + dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); + dictionaryOpen = false; + + inDictionaryStreamSource = missingStreamSource(BooleanStream.class); + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(LongStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + inDictionaryStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + inDictionaryStreamSource = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class); + dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + inDictionaryStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + 
.toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java new file mode 100644 index 0000000000..b50201cc0e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java @@ -0,0 +1,155 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.LongVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class LongDirectStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(LongStream.class); + @Nullable + private LongStream dataStream; + + private boolean rowGroupOpen; + + public LongDirectStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void 
readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(readOffset); + } + } + + LongVector longVector = (LongVector) vector; + if (presentStream == null) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + Arrays.fill(longVector.isNull, false); + dataStream.nextLongVector(nextBatchSize, longVector.vector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); + if (nullValues != nextBatchSize) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(LongStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); + + readOffset = 0; + nextBatchSize = 0; 
+ + presentStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java new file mode 100644 index 0000000000..6943049acd --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; + +public class LongStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + private final LongDirectStreamReader directReader; + private final LongDictionaryStreamReader dictionaryReader; + private StreamReader currentReader; + + public LongStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + directReader = new LongDirectStreamReader(streamDescriptor); + dictionaryReader = new LongDictionaryStreamReader(streamDescriptor); + } + + @Override + public void prepareNextRead(int batchSize) + { + currentReader.prepareNextRead(batchSize); + } + + @Override + public void readBatch(Object vector) + throws IOException + { + currentReader.readBatch(vector); + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); + if (kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT) { + currentReader = directReader; + } + else if (kind == DICTIONARY) { + currentReader = dictionaryReader; + } + else { + throw new IllegalArgumentException("Unsupported encoding " + kind); + } + + currentReader.startStripe(dictionaryStreamSources, encoding); + } + + @Override + public void 
startRowGroup(StreamSources dataStreamSources) + throws IOException + { + currentReader.startRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java new file mode 100644 index 0000000000..bf7f362be5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java @@ -0,0 +1,287 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.SliceVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.*; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class SliceDictionaryStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; + + @Nonnull + private StreamSource dictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); + private boolean dictionaryOpen; + private int dictionarySize; + @Nonnull + private Slice[] dictionary = new Slice[0]; + + @Nonnull + private StreamSource dictionaryLengthStreamSource = missingStreamSource(LongStream.class); + @Nonnull + private int[] dictionaryLength = new int[0]; + + @Nonnull + private StreamSource inDictionaryStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream inDictionaryStream; + private final boolean[] 
inDictionary = new boolean[Vector.MAX_VECTOR_LENGTH]; + + @Nonnull + private StreamSource rowGroupDictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); + @Nonnull + private Slice[] rowGroupDictionary = new Slice[0]; + + @Nonnull + private StreamSource rowGroupDictionaryLengthStreamSource = missingStreamSource(RowGroupDictionaryLengthStream.class); + @Nonnull + private int[] rowGroupDictionaryLength = new int[0]; + + @Nonnull + private StreamSource dataStreamSource = missingStreamSource(LongStream.class); + @Nullable + private LongStream dataStream; + @Nonnull + private final int[] dataVector = new int[Vector.MAX_VECTOR_LENGTH]; + + private boolean rowGroupOpen; + + public SliceDictionaryStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the length reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + if (inDictionaryStream != null) { + inDictionaryStream.skip(readOffset); + } + dataStream.skip(readOffset); + } + } + + SliceVector sliceVector = (SliceVector) vector; + + if (presentStream == null) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + Arrays.fill(isNullVector, false); + dataStream.nextIntVector(nextBatchSize, dataVector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector); + if (nullValues != nextBatchSize) { + verifyFormat(dataStream != null, "Value is 
not null but data stream is not present"); + dataStream.nextIntVector(nextBatchSize, dataVector, isNullVector); + } + } + + if (inDictionaryStream == null) { + Arrays.fill(inDictionary, true); + } + else { + inDictionaryStream.getSetBits(nextBatchSize, inDictionary, isNullVector); + } + + for (int i = 0; i < nextBatchSize; i++) { + if (isNullVector[i]) { + sliceVector.vector[i] = null; + } + else if (inDictionary[i]) { + sliceVector.vector[i] = dictionary[dataVector[i]]; + } + else { + sliceVector.vector[i] = rowGroupDictionary[dataVector[i]]; + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + // read the dictionary + if (!dictionaryOpen && dictionarySize > 0) { + // resize the dictionary array if necessary + if (dictionary.length < dictionarySize) { + dictionary = new Slice[dictionarySize]; + dictionaryLength = new int[dictionarySize]; + } + + // read the lengths + LongStream lengthStream = dictionaryLengthStreamSource.openStream(); + verifyFormat(lengthStream != null, "Dictionary is not empty but dictionary length stream is not present"); + lengthStream.nextIntVector(dictionarySize, dictionaryLength); + + ByteArrayStream dictionaryDataStream = dictionaryDataStreamSource.openStream(); + readDictionary(dictionaryDataStream, dictionarySize, dictionaryLength, dictionary); + } + dictionaryOpen = true; + + // read row group dictionary + RowGroupDictionaryLengthStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream(); + if (dictionaryLengthStream != null) { + int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount(); + + // resize the dictionary array if necessary + if (rowGroupDictionary.length < rowGroupDictionarySize) { + rowGroupDictionary = new Slice[rowGroupDictionarySize]; + rowGroupDictionaryLength = new int[rowGroupDictionarySize]; + } + + // read the lengths + dictionaryLengthStream.nextIntVector(rowGroupDictionarySize, rowGroupDictionaryLength); + + ByteArrayStream 
dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream(); + readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, rowGroupDictionary); + } + dictionaryOpen = true; + + presentStream = presentStreamSource.openStream(); + inDictionaryStream = inDictionaryStreamSource.openStream(); + dataStream = dataStreamSource.openStream(); + + rowGroupOpen = true; + } + + private static void readDictionary(@Nullable ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, Slice[] dictionary) + throws IOException + { + // sum lengths + int totalLength = 0; + for (int i = 0; i < dictionarySize; i++) { + totalLength += dictionaryLength[i]; + } + + // read dictionary data + byte[] dictionaryData = new byte[0]; + if (totalLength > 0) { + verifyFormat(dictionaryDataStream != null, "Dictionary length is not zero but dictionary data stream is not present"); + dictionaryData = dictionaryDataStream.next(totalLength); + } + + // build dictionary slices + int offset = 0; + for (int i = 0; i < dictionarySize; i++) { + int length = dictionaryLength[i]; + dictionary[i] = Slices.wrappedBuffer(dictionaryData, offset, length); + offset += length; + } + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + dictionaryDataStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, ByteArrayStream.class); + dictionaryLengthStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class); + dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); + dictionaryOpen = false; + + presentStreamSource = missingStreamSource(BooleanStream.class); + dataStreamSource = missingStreamSource(LongStream.class); + + inDictionaryStreamSource = missingStreamSource(BooleanStream.class); + rowGroupDictionaryLengthStreamSource = missingStreamSource(RowGroupDictionaryLengthStream.class); + 
rowGroupDictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + inDictionaryStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); + + // the "in dictionary" stream signals if the value is in the stripe or row group dictionary + inDictionaryStreamSource = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class); + rowGroupDictionaryLengthStreamSource = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY_LENGTH, RowGroupDictionaryLengthStream.class); + rowGroupDictionaryDataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY, ByteArrayStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + inDictionaryStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java new file mode 100644 index 0000000000..994b25d29a --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java @@ -0,0 +1,198 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.SliceVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.*; +import io.airlift.slice.Slices; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class SliceDirectStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; + + @Nonnull + private StreamSource lengthStreamSource = missingStreamSource(LongStream.class); + @Nullable + private LongStream lengthStream; + private 
final int[] lengthVector = new int[Vector.MAX_VECTOR_LENGTH]; + + @Nonnull + private StreamSource dataByteSource = missingStreamSource(ByteArrayStream.class); + @Nullable + private ByteArrayStream dataStream; + + private boolean rowGroupOpen; + + public SliceDirectStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the length reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + long dataSkipSize = lengthStream.sum(readOffset); + if (dataSkipSize > 0) { + verifyFormat(dataStream != null, "Value is not null but data stream is not present"); + dataStream.skip(Ints.checkedCast(dataSkipSize)); + } + } + } + + SliceVector sliceVector = (SliceVector) vector; + if (presentStream == null) { + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + lengthStream.nextIntVector(nextBatchSize, lengthVector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector); + if (nullValues != nextBatchSize) { + verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); + lengthStream.nextIntVector(nextBatchSize, lengthVector, isNullVector); + } + } + + int totalLength = 0; + for (int i = 0; i < nextBatchSize; i++) { + if (!isNullVector[i]) { + totalLength += lengthVector[i]; + } + } + + byte[] data = new byte[0]; + if (totalLength > 0) { + verifyFormat(dataStream != null, "Value is 
not null but data stream is not present"); + data = dataStream.next(totalLength); + } + + int offset = 0; + for (int i = 0; i < nextBatchSize; i++) { + if (!isNullVector[i]) { + int length = lengthVector[i]; + sliceVector.vector[i] = Slices.wrappedBuffer(data, offset, length); + offset += length; + } + else { + sliceVector.vector[i] = null; + } + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + lengthStream = lengthStreamSource.openStream(); + dataStream = dataByteSource.openStream(); + + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + lengthStreamSource = missingStreamSource(LongStream.class); + dataByteSource = missingStreamSource(ByteArrayStream.class); + + readOffset = 0; + nextBatchSize = 0; + + Arrays.fill(isNullVector, false); + + presentStream = null; + lengthStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + lengthStreamSource = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class); + dataByteSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteArrayStream.class); + + readOffset = 0; + nextBatchSize = 0; + + Arrays.fill(isNullVector, false); + + presentStream = null; + lengthStream = null; + dataStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java new file mode 100644 index 0000000000..e046dff632 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java @@ -0,0 +1,88 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; + +import java.io.IOException; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; + +public class SliceStreamReader + implements StreamReader +{ + private final StreamDescriptor streamDescriptor; + private final SliceDirectStreamReader directReader; + private final SliceDictionaryStreamReader dictionaryReader; + private StreamReader currentReader; + + public SliceStreamReader(StreamDescriptor streamDescriptor) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + directReader = new 
SliceDirectStreamReader(streamDescriptor); + dictionaryReader = new SliceDictionaryStreamReader(streamDescriptor); + } + + @Override + public void readBatch(Object vector) + throws IOException + { + currentReader.readBatch(vector); + } + + @Override + public void prepareNextRead(int batchSize) + { + currentReader.prepareNextRead(batchSize); + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + ColumnEncodingKind columnEncodingKind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); + if (columnEncodingKind == DIRECT || columnEncodingKind == DIRECT_V2 || columnEncodingKind == DWRF_DIRECT) { + currentReader = directReader; + } + else if (columnEncodingKind == DICTIONARY || columnEncodingKind == DICTIONARY_V2) { + currentReader = dictionaryReader; + } + else { + throw new IllegalArgumentException("Unsupported encoding " + columnEncodingKind); + } + + currentReader.startStripe(dictionaryStreamSources, encoding); + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + currentReader.startRowGroup(dataStreamSources); + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java new file mode 100644 index 0000000000..7d0e8cc9f2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java @@ -0,0 +1,58 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.joda.time.DateTimeZone; + +public final class StreamReaders +{ + private StreamReaders() + { + } + + public static StreamReader createStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + switch (streamDescriptor.getStreamType()) { + case BOOLEAN: + return new BooleanStreamReader(streamDescriptor); + case BYTE: + return new ByteStreamReader(streamDescriptor); + case SHORT: + case INT: + case LONG: + case DATE: + return new LongStreamReader(streamDescriptor); + case FLOAT: + return new FloatStreamReader(streamDescriptor); + case DOUBLE: + return new DoubleStreamReader(streamDescriptor); + case BINARY: + case STRING: + return new SliceStreamReader(streamDescriptor); + case TIMESTAMP: + return new TimestampStreamReader(streamDescriptor, hiveStorageTimeZone); + case STRUCT: + case LIST: + case MAP: + return new JsonStreamReader(streamDescriptor, hiveStorageTimeZone); + case UNION: + case DECIMAL: + case VARCHAR: + case CHAR: + default: + throw new IllegalArgumentException("Unsupported type: " + streamDescriptor.getStreamType()); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java new file mode 100644 index 0000000000..ba96f7cdcb --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java @@ -0,0 +1,217 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.reader; + +import org.apache.tajo.storage.thirdparty.orc.LongVector; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; +import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; +import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; +import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; +import org.joda.time.DateTime; +import org.joda.time.DateTimeZone; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; + +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class TimestampStreamReader + implements StreamReader +{ + private 
static final int MILLIS_PER_SECOND = 1000; + + private final StreamDescriptor streamDescriptor; + private final long baseTimestampInSeconds; + + private int readOffset; + private int nextBatchSize; + + @Nonnull + private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); + @Nullable + private BooleanStream presentStream; + + @Nonnull + private StreamSource secondsStreamSource = missingStreamSource(LongStream.class); + @Nullable + private LongStream secondsStream; + + @Nonnull + private StreamSource nanosStreamSource = missingStreamSource(LongStream.class); + @Nullable + private LongStream nanosStream; + + private final long[] nanosVector = new long[Vector.MAX_VECTOR_LENGTH]; + + private boolean rowGroupOpen; + + public TimestampStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) + { + this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); + this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / MILLIS_PER_SECOND; + } + + @Override + public void prepareNextRead(int batchSize) + { + readOffset += nextBatchSize; + nextBatchSize = batchSize; + } + + @Override + public void readBatch(Object vector) + throws IOException + { + if (!rowGroupOpen) { + openRowGroup(); + } + + if (readOffset > 0) { + if (presentStream != null) { + // skip ahead the present bit reader, but count the set bits + // and use this as the skip size for the data reader + readOffset = presentStream.countBitsSet(readOffset); + } + if (readOffset > 0) { + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + secondsStream.skip(readOffset); + nanosStream.skip(readOffset); + } + } + + LongVector longVector = (LongVector) vector; + if (presentStream == null) { + verifyFormat(secondsStream != null, "Value is not null 
but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + Arrays.fill(longVector.isNull, false); + secondsStream.nextLongVector(nextBatchSize, longVector.vector); + nanosStream.nextLongVector(nextBatchSize, nanosVector); + } + else { + int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); + if (nullValues != nextBatchSize) { + verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); + verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); + + secondsStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); + nanosStream.nextLongVector(nextBatchSize, nanosVector, longVector.isNull); + } + } + + // merge seconds and nanos together + for (int i = 0; i < nextBatchSize; i++) { + longVector.vector[i] = decodeTimestamp(longVector.vector[i], nanosVector[i], baseTimestampInSeconds); + } + + readOffset = 0; + nextBatchSize = 0; + } + + private void openRowGroup() + throws IOException + { + presentStream = presentStreamSource.openStream(); + secondsStream = secondsStreamSource.openStream(); + nanosStream = nanosStreamSource.openStream(); + rowGroupOpen = true; + } + + @Override + public void startStripe(StreamSources dictionaryStreamSources, List encoding) + throws IOException + { + presentStreamSource = missingStreamSource(BooleanStream.class); + secondsStreamSource = missingStreamSource(LongStream.class); + nanosStreamSource = missingStreamSource(LongStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + secondsStream = null; + nanosStream = null; + + rowGroupOpen = false; + } + + @Override + public void startRowGroup(StreamSources dataStreamSources) + throws IOException + { + presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); + secondsStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, 
LongStream.class); + nanosStreamSource = dataStreamSources.getStreamSource(streamDescriptor, SECONDARY, LongStream.class); + + readOffset = 0; + nextBatchSize = 0; + + presentStream = null; + secondsStream = null; + nanosStream = null; + + rowGroupOpen = false; + } + + @Override + public String toString() + { + return toStringHelper(this) + .addValue(streamDescriptor) + .toString(); + } + + // This comes from the Apache Hive ORC code + public static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) + { + long millis = (seconds + baseTimestampInSeconds) * MILLIS_PER_SECOND; + long nanos = parseNanos(serializedNanos); + + // the rounding error exists because java always rounds up when dividing integers + // -42001/1000 = -42; and -42001 % 1000 = -1 (+ 1000) + // to get the correct value we need + // (-42 - 1)*1000 + 999 = -42001 + // (42)*1000 + 1 = 42001 + if (millis < 0 && nanos != 0) { + millis -= 1000; + } + // Truncate nanos to millis and add to mills + return millis + (nanos / 1000000); + } + + // This comes from the Apache Hive ORC code + private static int parseNanos(long serialized) + { + int zeros = ((int) serialized) & 0x7; // 0b111 + int result = (int) (serialized >>> 3); + if (zeros != 0) { + for (int i = 0; i <= zeros; ++i) { + result *= 10; + } + } + return result; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java new file mode 100644 index 0000000000..853609af56 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java @@ -0,0 +1,67 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteArrayStreamCheckpoint; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; + +public class ByteArrayStream + implements ValueStream +{ + private final OrcInputStream inputStream; + + public ByteArrayStream(OrcInputStream inputStream) + { + this.inputStream = checkNotNull(inputStream, "inputStream is null"); + } + + public byte[] next(int length) + throws IOException + { + byte[] data = new byte[length]; + readFully(inputStream, data, 0, length); + return data; + } + + public void next(int length, byte[] data) + throws IOException + { + readFully(inputStream, data, 0, length); + } + + @Override + public Class getCheckpointType() + { + return ByteArrayStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(ByteArrayStreamCheckpoint checkpoint) + throws IOException + { + inputStream.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); + } + + @Override + public void skip(int skipSize) + throws IOException + { + skipFully(inputStream, skipSize); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java new file mode 
100644 index 0000000000..adb27cbeb9 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java @@ -0,0 +1,134 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteStreamCheckpoint; + +import java.io.IOException; +import java.util.Arrays; + +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; + +public class ByteStream + implements ValueStream +{ + private final OrcInputStream input; + private final byte[] buffer = new byte[MIN_REPEAT_SIZE + 127]; + private int length; + private int offset; + private long lastReadInputCheckpoint; + + public ByteStream(OrcInputStream input) + { + this.input = input; + lastReadInputCheckpoint = input.getCheckpoint(); + } + + // This is based on the Apache Hive ORC code + private void readNextBlock() + throws IOException + { + lastReadInputCheckpoint = input.getCheckpoint(); + + int control = input.read(); + verifyFormat(control != -1, "Read past end of buffer RLE byte from %s", input); + + offset = 0; + + // if byte high bit is not set, this is a repetition; otherwise it is a literal sequence + if ((control & 0x80) == 0) { + 
length = control + MIN_REPEAT_SIZE; + + // read the repeated value + int value = input.read(); + verifyFormat(value != -1, "Reading RLE byte got EOF"); + + // fill buffer with the value + Arrays.fill(buffer, 0, length, (byte) value); + } + else { + // length is 2's complement of byte + length = 0x100 - control; + + // read the literals into the buffer + readFully(input, buffer, 0, length); + } + } + + @Override + public Class getCheckpointType() + { + return ByteStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(ByteStreamCheckpoint checkpoint) + throws IOException + { + // if the checkpoint is within the current buffer, just adjust the pointer + if (lastReadInputCheckpoint == checkpoint.getInputStreamCheckpoint() && checkpoint.getOffset() <= length) { + offset = checkpoint.getOffset(); + } + else { + // otherwise, discard the buffer and start over + input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); + length = 0; + offset = 0; + skip(checkpoint.getOffset()); + } + } + + @Override + public void skip(int items) + throws IOException + { + while (items > 0) { + if (offset == length) { + readNextBlock(); + } + long consume = Math.min(items, length - offset); + offset += consume; + items -= consume; + } + } + + public byte next() + throws IOException + { + if (offset == length) { + readNextBlock(); + } + return buffer[offset++]; + } + + public void nextVector(long items, long[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = next(); + } + } + + public void nextVector(long items, long[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = next(); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java new file mode 100644 index 
0000000000..6c3e5ea6c9 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java @@ -0,0 +1,69 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; + +import javax.annotation.Nullable; +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkNotNull; + +public class CheckpointStreamSource, C extends StreamCheckpoint> + implements StreamSource +{ + public static , C extends StreamCheckpoint> CheckpointStreamSource createCheckpointStreamSource(S stream, StreamCheckpoint checkpoint) + { + checkNotNull(stream, "stream is null"); + checkNotNull(checkpoint, "checkpoint is null"); + + Class checkpointType = stream.getCheckpointType(); + C verifiedCheckpoint = OrcStreamUtils.checkType(checkpoint, checkpointType, "Checkpoint"); + return new CheckpointStreamSource(stream, verifiedCheckpoint); + } + + private final S stream; + private final C checkpoint; + + public CheckpointStreamSource(S stream, C checkpoint) + { + this.stream = checkNotNull(stream, "stream is null"); + this.checkpoint = checkNotNull(checkpoint, "checkpoint is null"); + } + + @Override + public Class getStreamType() + { + return (Class) stream.getClass(); + } + + @Nullable + @Override + public S openStream() + 
throws IOException + { + stream.seekToCheckpoint(checkpoint); + return stream; + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("stream", stream) + .add("checkpoint", checkpoint) + .toString(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java new file mode 100644 index 0000000000..08f1f160e2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java @@ -0,0 +1,104 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.DoubleStreamCheckpoint; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkPositionIndex; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; +import static io.airlift.slice.SizeOf.SIZE_OF_DOUBLE; + +public class DoubleStream + implements ValueStream +{ + private final OrcInputStream input; + private final byte[] buffer = new byte[Vector.MAX_VECTOR_LENGTH * SIZE_OF_DOUBLE]; + private final Slice slice = Slices.wrappedBuffer(buffer); + + public DoubleStream(OrcInputStream input) + { + this.input = input; + } + + @Override + public Class getCheckpointType() + { + return DoubleStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(DoubleStreamCheckpoint checkpoint) + throws IOException + { + input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); + } + + @Override + public void skip(int items) + throws IOException + { + long length = items * SIZE_OF_DOUBLE; + skipFully(input, length); + } + + public double next() + throws IOException + { + readFully(input, buffer, 0, SIZE_OF_DOUBLE); + return slice.getDouble(0); + } + + public void nextVector(int items, double[] vector) + throws IOException + { + checkPositionIndex(items, vector.length); + checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); + + // buffer that number of values + readFully(input, buffer, 0, items * SIZE_OF_DOUBLE); + + // copy values directly into vector + Slices.wrappedDoubleArray(vector).setBytes(0, slice, 0, items * SIZE_OF_DOUBLE); + } + + public void nextVector(long items, double[] vector, boolean[] isNull) + throws IOException + { + // count the number of non nulls + int 
notNullCount = 0; + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + notNullCount++; + } + } + + // buffer that umber of values + readFully(input, buffer, 0, notNullCount * SIZE_OF_DOUBLE); + + // load them into the buffer + int elementIndex = 0; + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = slice.getDouble(elementIndex); + elementIndex += SIZE_OF_DOUBLE; + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java new file mode 100644 index 0000000000..722c9470fd --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java @@ -0,0 +1,109 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.FloatStreamCheckpoint; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkPositionIndex; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; +import static io.airlift.slice.SizeOf.SIZE_OF_FLOAT; + +public class FloatStream + implements ValueStream +{ + private final OrcInputStream input; + private final byte[] buffer = new byte[Vector.MAX_VECTOR_LENGTH * SIZE_OF_FLOAT]; + private final Slice slice = Slices.wrappedBuffer(buffer); + + public FloatStream(OrcInputStream input) + { + this.input = input; + } + + @Override + public Class getCheckpointType() + { + return FloatStreamCheckpoint.class; + } + + @Override + public void seekToCheckpoint(FloatStreamCheckpoint checkpoint) + throws IOException + { + input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); + } + + @Override + public void skip(int items) + throws IOException + { + long length = items * SIZE_OF_FLOAT; + skipFully(input, length); + } + + public float next() + throws IOException + { + readFully(input, buffer, 0, SIZE_OF_FLOAT); + return slice.getFloat(0); + } + + public void nextVector(int items, double[] vector) + throws IOException + { + checkPositionIndex(items, vector.length); + checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); + + // buffer that number of values + readFully(input, buffer, 0, items * SIZE_OF_FLOAT); + + // load them into the buffer one at a time since we are reading + // floats into a double vector + int elementIndex = 0; + for (int i = 0; i < items; i++) { + vector[i] = slice.getFloat(elementIndex); + elementIndex += SIZE_OF_FLOAT; + } + } + + public void nextVector(long items, 
double[] vector, boolean[] isNull) + throws IOException + { + // count the number of non nulls + int notNullCount = 0; + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + notNullCount++; + } + } + + // buffer that umber of values + readFully(input, buffer, 0, notNullCount * SIZE_OF_FLOAT); + + // load them into the buffer + int elementIndex = 0; + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = slice.getFloat(elementIndex); + elementIndex += SIZE_OF_FLOAT; + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java new file mode 100644 index 0000000000..40753bfe75 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java @@ -0,0 +1,177 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; + +import java.io.IOException; +import java.io.InputStream; + +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; +import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; + +// This is based on the Apache Hive ORC code +public final class LongDecode +{ + private LongDecode() + { + } + + enum FixedBitSizes + { + ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, + THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, + TWENTY, TWENTY_ONE, TWENTY_TWO, TWENTY_THREE, TWENTY_FOUR, TWENTY_SIX, + TWENTY_EIGHT, THIRTY, THIRTY_TWO, FORTY, FORTY_EIGHT, FIFTY_SIX, SIXTY_FOUR; + } + + /** + * Decodes the ordinal fixed bit value to actual fixed bit width value. + */ + public static int decodeBitWidth(int n) + { + if (n >= ONE.ordinal() && n <= TWENTY_FOUR.ordinal()) { + return n + 1; + } + else if (n == TWENTY_SIX.ordinal()) { + return 26; + } + else if (n == TWENTY_EIGHT.ordinal()) { + return 28; + } + else if (n == THIRTY.ordinal()) { + return 30; + } + else if (n == THIRTY_TWO.ordinal()) { + return 32; + } + else if (n == FORTY.ordinal()) { + return 40; + } + else if (n == FORTY_EIGHT.ordinal()) { + return 48; + } + else if (n == FIFTY_SIX.ordinal()) { + return 56; + } + else { + return 64; + } + } + + /** + * Gets the closest supported fixed bit width for the specified bit width. 
+ */ + public static int getClosestFixedBits(int width) + { + if (width == 0) { + return 1; + } + + if (width >= 1 && width <= 24) { + return width; + } + else if (width > 24 && width <= 26) { + return 26; + } + else if (width > 26 && width <= 28) { + return 28; + } + else if (width > 28 && width <= 30) { + return 30; + } + else if (width > 30 && width <= 32) { + return 32; + } + else if (width > 32 && width <= 40) { + return 40; + } + else if (width > 40 && width <= 48) { + return 48; + } + else if (width > 48 && width <= 56) { + return 56; + } + else { + return 64; + } + } + + public static long readSignedVInt(InputStream inputStream) + throws IOException + { + long result = readUnsignedVInt(inputStream); + return (result >>> 1) ^ -(result & 1); + } + + public static long readUnsignedVInt(InputStream inputStream) + throws IOException + { + long result = 0; + int offset = 0; + long b; + do { + b = inputStream.read(); + verifyFormat(b != -1, "EOF while reading unsigned vint"); + result |= (b & 0x7F /* 0b0111_1111 */) << offset; + offset += 7; + } while ((b & 0x80 /* 0b1000_0000 */) != 0); + return result; + } + + public static long readVInt(boolean signed, InputStream inputStream) + throws IOException + { + if (signed) { + return readSignedVInt(inputStream); + } + else { + return readUnsignedVInt(inputStream); + } + } + + public static long zigzagDecode(long value) + { + return (value >>> 1) ^ -(value & 1); + } + + public static long readDwrfLong(InputStream input, OrcTypeKind type, boolean signed, boolean usesVInt) + throws IOException + { + if (usesVInt) { + return readVInt(signed, input); + } + else if (type == SHORT) { + return input.read() | (input.read() << 8); + } + else if (type == INT) { + return input.read() | (input.read() << 8) | (input.read() << 16) | (input.read() << 24); + } + else if (type == LONG) { + return ((long) input.read()) | + (((long) input.read()) << 8) | + (((long) input.read()) << 16) | + (((long) input.read()) << 24) | + (((long) 
input.read()) << 32) | + (((long) input.read()) << 40) | + (((long) input.read()) << 48) | + (((long) input.read()) << 56); + } + else { + throw new IllegalArgumentException(type + " type is not supported"); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java new file mode 100644 index 0000000000..e037be6c3e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java @@ -0,0 +1,129 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.Vector; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamDwrfCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; + +import java.io.IOException; + +import static com.google.common.base.Preconditions.checkPositionIndex; +import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.readDwrfLong; + +public class LongStreamDwrf + implements LongStream +{ + private final OrcInputStream input; + private final OrcTypeKind orcTypeKind; + private final boolean signed; + private final boolean usesVInt; + + public LongStreamDwrf(OrcInputStream input, OrcTypeKind type, boolean signed, boolean usesVInt) + { + this.input = input; + this.orcTypeKind = type; + this.signed = signed; + this.usesVInt = usesVInt; + } + + @Override + public Class getCheckpointType() + { + return LongStreamDwrfCheckpoint.class; + } + + @Override + public void seekToCheckpoint(LongStreamCheckpoint checkpoint) + throws IOException + { + LongStreamDwrfCheckpoint dwrfCheckpoint = OrcStreamUtils.checkType(checkpoint, LongStreamDwrfCheckpoint.class, "Checkpoint"); + input.seekToCheckpoint(dwrfCheckpoint.getInputStreamCheckpoint()); + } + + @Override + public void skip(int items) + throws IOException + { + // there is no fast way to skip values + for (int i = 0; i < items; i++) { + next(); + } + } + + @Override + public long sum(int items) + throws IOException + { + long sum = 0; + for (int i = 0; i < items; i++) { + sum += next(); + } + return sum; + } + + @Override + public long next() + throws IOException + { + return readDwrfLong(input, orcTypeKind, signed, usesVInt); + } + + @Override + public void nextIntVector(int items, int[] vector) + throws IOException + { + checkPositionIndex(items, vector.length); + 
checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); + + for (int i = 0; i < items; i++) { + vector[i] = Ints.checkedCast(next()); + } + } + + @Override + public void nextIntVector(int items, int[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = Ints.checkedCast(next()); + } + } + } + + @Override + public void nextLongVector(int items, long[] vector) + throws IOException + { + checkPositionIndex(items, vector.length); + + for (int i = 0; i < items; i++) { + vector[i] = next(); + } + } + + @Override + public void nextLongVector(int items, long[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = next(); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java new file mode 100644 index 0000000000..29a6d25ef6 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java @@ -0,0 +1,184 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV1Checkpoint; + +import java.io.IOException; + +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; + +public class LongStreamV1 + implements LongStream +{ + private static final int MAX_LITERAL_SIZE = 128; + + private final OrcInputStream input; + private final boolean signed; + private final long[] literals = new long[MAX_LITERAL_SIZE]; + private int numLiterals; + private int delta; + private int used; + private boolean repeat; + private long lastReadInputCheckpoint; + + public LongStreamV1(OrcInputStream input, boolean signed) + { + this.input = input; + this.signed = signed; + lastReadInputCheckpoint = input.getCheckpoint(); + } + + // This comes from the Apache Hive ORC code + private void readValues() + throws IOException + { + lastReadInputCheckpoint = input.getCheckpoint(); + + int control = input.read(); + verifyFormat(control != -1, "Read past end of RLE integer from %s", input); + + if (control < 0x80) { + numLiterals = control + MIN_REPEAT_SIZE; + used = 0; + repeat = true; + delta = input.read(); + verifyFormat(delta != -1, "End of stream in RLE Integer from %s", input); + + // convert from 0 to 255 to -128 to 127 by converting to a signed byte + // noinspection SillyAssignment + delta = (byte) delta; + literals[0] = LongDecode.readVInt(signed, input); + } + else { + numLiterals = 0x100 - control; + used = 0; + repeat = false; + for (int i = 0; i < numLiterals; ++i) { + literals[i] = LongDecode.readVInt(signed, input); + } + } + } + + @Override + // This comes from the Apache Hive ORC code + public long next() + throws IOException + { + long result; + if (used == 
numLiterals) { + readValues(); + } + if (repeat) { + result = literals[0] + (used++) * delta; + } + else { + result = literals[used++]; + } + return result; + } + + @Override + public Class getCheckpointType() + { + return LongStreamV1Checkpoint.class; + } + + @Override + public void seekToCheckpoint(LongStreamCheckpoint checkpoint) + throws IOException + { + LongStreamV1Checkpoint v1Checkpoint = OrcStreamUtils.checkType(checkpoint, LongStreamV1Checkpoint.class, "Checkpoint"); + + // if the checkpoint is within the current buffer, just adjust the pointer + if (lastReadInputCheckpoint == v1Checkpoint.getInputStreamCheckpoint() && v1Checkpoint.getOffset() <= numLiterals) { + used = v1Checkpoint.getOffset(); + } + else { + // otherwise, discard the buffer and start over + input.seekToCheckpoint(v1Checkpoint.getInputStreamCheckpoint()); + numLiterals = 0; + used = 0; + skip(v1Checkpoint.getOffset()); + } + } + + @Override + public void skip(int items) + throws IOException + { + while (items > 0) { + if (used == numLiterals) { + readValues(); + } + long consume = Math.min(items, numLiterals - used); + used += consume; + items -= consume; + } + } + + @Override + public long sum(int items) + throws IOException + { + long sum = 0; + for (int i = 0; i < items; i++) { + sum += next(); + } + return sum; + } + + @Override + public void nextLongVector(int items, long[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = next(); + } + } + + @Override + public void nextLongVector(int items, long[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = next(); + } + } + } + + @Override + public void nextIntVector(int items, int[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = Ints.checkedCast(next()); + } + } + + @Override + public void nextIntVector(int items, int[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) 
{ + if (!isNull[i]) { + vector[i] = Ints.checkedCast(next()); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java new file mode 100644 index 0000000000..f22b3681d2 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java @@ -0,0 +1,452 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import com.google.common.primitives.Ints; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; +import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV2Checkpoint; + +import java.io.IOException; +import java.io.InputStream; + +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; + +/** + * @see {@link org.apache.hadoop.hive.ql.io.orc.RunLengthIntegerWriterV2} for description of various lightweight compression techniques. 
+ */ +// This comes from the Apache Hive ORC code +public class LongStreamV2 + implements LongStream +{ + private static final int MAX_LITERAL_SIZE = 512; + + private enum EncodingType + { + SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA + } + + private final OrcInputStream input; + private final boolean signed; + private final long[] literals = new long[MAX_LITERAL_SIZE]; + private int numLiterals; + private int used; + private final boolean skipCorrupt; + private long lastReadInputCheckpoint; + + public LongStreamV2(OrcInputStream input, boolean signed, boolean skipCorrupt) + { + this.input = input; + this.signed = signed; + this.skipCorrupt = skipCorrupt; + lastReadInputCheckpoint = input.getCheckpoint(); + } + + // This comes from the Apache Hive ORC code + private void readValues() + throws IOException + { + lastReadInputCheckpoint = input.getCheckpoint(); + + // read the first 2 bits and determine the encoding type + int firstByte = input.read(); + verifyFormat(firstByte >= 0, "Read past end of RLE integer from %s", input); + + int enc = (firstByte >>> 6) & 0x03; + if (EncodingType.SHORT_REPEAT.ordinal() == enc) { + readShortRepeatValues(firstByte); + } + else if (EncodingType.DIRECT.ordinal() == enc) { + readDirectValues(firstByte); + } + else if (EncodingType.PATCHED_BASE.ordinal() == enc) { + readPatchedBaseValues(firstByte); + } + else { + readDeltaValues(firstByte); + } + } + + // This comes from the Apache Hive ORC code + private void readDeltaValues(int firstByte) + throws IOException + { + // extract the number of fixed bits + int fixedBits = (firstByte >>> 1) & 0x1f; + if (fixedBits != 0) { + fixedBits = LongDecode.decodeBitWidth(fixedBits); + } + + // extract the blob run length + int length = (firstByte & 0x01) << 8; + length |= input.read(); + + // read the first value stored as vint + long firstVal = LongDecode.readVInt(signed, input); + + // store first value to result buffer + literals[numLiterals++] = firstVal; + + // if fixed bits is 0 then all 
values have fixed delta + long prevVal; + if (fixedBits == 0) { + // read the fixed delta value stored as vint (deltas can be negative even + // if all number are positive) + long fixedDelta = LongDecode.readSignedVInt(input); + + // add fixed deltas to adjacent values + for (int i = 0; i < length; i++) { + literals[numLiterals++] = literals[numLiterals - 2] + fixedDelta; + } + } + else { + long deltaBase = LongDecode.readSignedVInt(input); + // add delta base and first value + literals[numLiterals++] = firstVal + deltaBase; + prevVal = literals[numLiterals - 1]; + length -= 1; + + // write the unpacked values, add it to previous value and store final + // value to result buffer. if the delta base value is negative then it + // is a decreasing sequence else an increasing sequence + readBitPackedLongs(literals, numLiterals, length, fixedBits, input); + while (length > 0) { + if (deltaBase < 0) { + literals[numLiterals] = prevVal - literals[numLiterals]; + } + else { + literals[numLiterals] = prevVal + literals[numLiterals]; + } + prevVal = literals[numLiterals]; + length--; + numLiterals++; + } + } + } + + // This comes from the Apache Hive ORC code + private void readPatchedBaseValues(int firstByte) + throws IOException + { + // extract the number of fixed bits + int fb = LongDecode.decodeBitWidth((firstByte >>> 1) & 0x1F); // 0b1_1111 + + // extract the run length of data blob + int length = (firstByte & 0x01) << 8; + length |= input.read(); + // runs are always one off + length += 1; + + // extract the number of bytes occupied by base + int thirdByte = input.read(); + int baseWidth = (thirdByte >>> 5) & 0x07; // 0b0111 + // base width is one off + baseWidth += 1; + + // extract patch width + int patchWidth = LongDecode.decodeBitWidth(thirdByte & 0x1F); // 0b1_1111 + + // read fourth byte and extract patch gap width + int fourthByte = input.read(); + int patchGapWidth = (fourthByte >>> 5) & 0x07; // 0b0111 + // patch gap width is one off + patchGapWidth += 1; + + 
// extract the length of the patch list + int patchListLength = fourthByte & 0x1F; // 0b1_1111 + + // read the next base width number of bytes to extract base value + long base = bytesToLongBE(input, baseWidth); + long mask = (1L << ((baseWidth * 8) - 1)); + // if MSB of base value is 1 then base is negative value else positive + if ((base & mask) != 0) { + base = base & ~mask; + base = -base; + } + + // unpack the data blob + long[] unpacked = new long[length]; + readBitPackedLongs(unpacked, 0, length, fb, input); + + // unpack the patch blob + long[] unpackedPatch = new long[patchListLength]; + + verifyFormat((patchWidth + patchGapWidth) <= 64 || skipCorrupt, "ORC file is corrupt"); + + int bitSize = LongDecode.getClosestFixedBits(patchWidth + patchGapWidth); + readBitPackedLongs(unpackedPatch, 0, patchListLength, bitSize, input); + + // apply the patch directly when decoding the packed data + int patchIndex = 0; + long currentGap; + long currentPatch; + long patchMask = ((1L << patchWidth) - 1); + currentGap = unpackedPatch[patchIndex] >>> patchWidth; + currentPatch = unpackedPatch[patchIndex] & patchMask; + long actualGap = 0; + + // special case: gap is >255 then patch value will be 0. 
+ // if gap is <=255 then patch value cannot be 0 + while (currentGap == 255 && currentPatch == 0) { + actualGap += 255; + patchIndex++; + currentGap = unpackedPatch[patchIndex] >>> patchWidth; + currentPatch = unpackedPatch[patchIndex] & patchMask; + } + // add the left over gap + actualGap += currentGap; + + // unpack data blob, patch it (if required), add base to get final result + for (int i = 0; i < unpacked.length; i++) { + if (i == actualGap) { + // extract the patch value + long patchedValue = unpacked[i] | (currentPatch << fb); + + // add base to patched value + literals[numLiterals++] = base + patchedValue; + + // increment the patch to point to next entry in patch list + patchIndex++; + + if (patchIndex < patchListLength) { + // read the next gap and patch + currentGap = unpackedPatch[patchIndex] >>> patchWidth; + currentPatch = unpackedPatch[patchIndex] & patchMask; + actualGap = 0; + + // special case: gap is >255 then patch will be 0. if gap is + // <=255 then patch cannot be 0 + while (currentGap == 255 && currentPatch == 0) { + actualGap += 255; + patchIndex++; + currentGap = unpackedPatch[patchIndex] >>> patchWidth; + currentPatch = unpackedPatch[patchIndex] & patchMask; + } + // add the left over gap + actualGap += currentGap; + + // next gap is relative to the current gap + actualGap += i; + } + } + else { + // no patching required. 
add base to unpacked value to get final value + literals[numLiterals++] = base + unpacked[i]; + } + } + + } + + // This comes from the Apache Hive ORC code + private void readDirectValues(int firstByte) + throws IOException + { + // extract the number of fixed bits + int fixedBits = LongDecode.decodeBitWidth((firstByte >>> 1) & 0x1F); // 0b1_1111 + + // extract the run length + int length = (firstByte & 0x01) << 8; + length |= input.read(); + // runs are one off + length += 1; + + // write the unpacked values and zigzag decode to result buffer + readBitPackedLongs(literals, numLiterals, length, fixedBits, input); + if (signed) { + for (int i = 0; i < length; i++) { + literals[numLiterals] = LongDecode.zigzagDecode(literals[numLiterals]); + numLiterals++; + } + } + else { + numLiterals += length; + } + } + + // This comes from the Apache Hive ORC code + private void readShortRepeatValues(int firstByte) + throws IOException + { + // read the number of bytes occupied by the value + int size = (firstByte >>> 3) & 0x07; // 0b0111 + // #bytes are one off + size += 1; + + // read the run length + int length = firstByte & 0x07; + // run lengths values are stored only after MIN_REPEAT value is met + length += MIN_REPEAT_SIZE; + + // read the repeated value which is store using fixed bytes + long val = bytesToLongBE(input, size); + + if (signed) { + val = LongDecode.zigzagDecode(val); + } + + // repeat the value for length times + for (int i = 0; i < length; i++) { + literals[numLiterals++] = val; + } + } + + // This comes from the Apache Hive ORC code + private static void readBitPackedLongs(long[] buffer, int offset, int len, int bitSize, InputStream input) + throws IOException + { + int bitsLeft = 0; + int current = 0; + + for (int i = offset; i < (offset + len); i++) { + long result = 0; + int bitsLeftToRead = bitSize; + while (bitsLeftToRead > bitsLeft) { + result <<= bitsLeft; + result |= current & ((1 << bitsLeft) - 1); + bitsLeftToRead -= bitsLeft; + current = 
input.read(); + bitsLeft = 8; + } + + // handle the left over bits + if (bitsLeftToRead > 0) { + result <<= bitsLeftToRead; + bitsLeft -= bitsLeftToRead; + result |= (current >> bitsLeft) & ((1 << bitsLeftToRead) - 1); + } + buffer[i] = result; + } + } + + /** + * Read n bytes in big endian order and convert to long. + */ + private static long bytesToLongBE(InputStream input, int n) + throws IOException + { + long out = 0; + long val; + while (n > 0) { + n--; + // store it in a long and then shift else integer overflow will occur + val = input.read(); + out |= (val << (n * 8)); + } + return out; + } + + @Override + public long next() + throws IOException + { + if (used == numLiterals) { + numLiterals = 0; + used = 0; + readValues(); + } + return literals[used++]; + } + + @Override + public Class getCheckpointType() + { + return LongStreamV2Checkpoint.class; + } + + @Override + public void seekToCheckpoint(LongStreamCheckpoint checkpoint) + throws IOException + { + LongStreamV2Checkpoint v2Checkpoint = OrcStreamUtils.checkType(checkpoint, LongStreamV2Checkpoint.class, "Checkpoint"); + + // if the checkpoint is within the current buffer, just adjust the pointer + if (lastReadInputCheckpoint == v2Checkpoint.getInputStreamCheckpoint() && v2Checkpoint.getOffset() <= numLiterals) { + used = v2Checkpoint.getOffset(); + } + else { + // otherwise, discard the buffer and start over + input.seekToCheckpoint(v2Checkpoint.getInputStreamCheckpoint()); + numLiterals = 0; + used = 0; + skip(v2Checkpoint.getOffset()); + } + } + + @Override + public void skip(int items) + throws IOException + { + while (items > 0) { + if (used == numLiterals) { + numLiterals = 0; + used = 0; + readValues(); + } + long consume = Math.min(items, numLiterals - used); + used += consume; + items -= consume; + } + } + + @Override + public long sum(int items) + throws IOException + { + long sum = 0; + for (int i = 0; i < items; i++) { + sum += next(); + } + return sum; + } + + @Override + public void 
nextLongVector(int items, long[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = next(); + } + } + + @Override + public void nextLongVector(int items, long[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = next(); + } + } + } + + @Override + public void nextIntVector(int items, int[] vector) + throws IOException + { + for (int i = 0; i < items; i++) { + vector[i] = Ints.checkedCast(next()); + } + } + + @Override + public void nextIntVector(int items, int[] vector, boolean[] isNull) + throws IOException + { + for (int i = 0; i < items; i++) { + if (!isNull[i]) { + vector[i] = Ints.checkedCast(next()); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java new file mode 100644 index 0000000000..54472236d8 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java @@ -0,0 +1,274 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import com.google.common.base.MoreObjects; +import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; +import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; +import io.airlift.slice.BasicSliceInput; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.iq80.snappy.Snappy; + +import java.io.IOException; +import java.io.InputStream; +import java.util.zip.DataFormatException; +import java.util.zip.Inflater; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; +import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.*; +import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; +import static io.airlift.slice.Slices.EMPTY_SLICE; +import static sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET; + +public final class OrcInputStream + extends InputStream +{ + public static final int BLOCK_HEADER_SIZE = 3; + + private final String source; + private final BasicSliceInput compressedSliceInput; + private final CompressionKind compressionKind; + private final int bufferSize; + + private int currentCompressedBlockOffset; + private BasicSliceInput current; + + private Slice buffer; + + public OrcInputStream(String source, BasicSliceInput sliceInput, CompressionKind compressionKind, int bufferSize) + { + this.source = checkNotNull(source, "source is null"); + + checkNotNull(sliceInput, "sliceInput is null"); + + this.compressionKind = checkNotNull(compressionKind, "compressionKind is null"); + this.bufferSize = bufferSize; + + if (compressionKind == UNCOMPRESSED) { + this.current = sliceInput; + this.compressedSliceInput = EMPTY_SLICE.getInput(); + } + else { + checkArgument(compressionKind == SNAPPY || compressionKind == ZLIB, "%s compression not 
supported", compressionKind); + this.compressedSliceInput = checkNotNull(sliceInput, "compressedSliceInput is null"); + this.current = EMPTY_SLICE.getInput(); + } + } + + @Override + public void close() + throws IOException + { + current = null; + } + + @Override + public int available() + throws IOException + { + if (current == null) { + return 0; + } + return current.available(); + } + + @Override + public boolean markSupported() + { + return false; + } + + @Override + public int read() + throws IOException + { + if (current == null) { + return -1; + } + + int result = current.read(); + if (result != -1) { + return result; + } + + advance(); + return read(); + } + + @Override + public int read(byte[] b, int off, int length) + throws IOException + { + if (current == null) { + return -1; + } + + if (!current.isReadable()) { + advance(); + if (current == null) { + return -1; + } + } + + return current.read(b, off, length); + } + + public long getCheckpoint() + { + // if the decompressed buffer is empty, return a checkpoint starting at the next block + if (current == null || (current.position() == 0 && current.available() == 0)) { + return createInputStreamCheckpoint(compressedSliceInput.position(), 0); + } + // otherwise return a checkpoint at the last compressed block read and the current position in the buffer + return createInputStreamCheckpoint(currentCompressedBlockOffset, current.position()); + } + + public boolean seekToCheckpoint(long checkpoint) + throws IOException + { + int compressedBlockOffset = decodeCompressedBlockOffset(checkpoint); + int decompressedOffset = decodeDecompressedOffset(checkpoint); + boolean discardedBuffer; + if (compressedBlockOffset != currentCompressedBlockOffset) { + verifyFormat(compressionKind != UNCOMPRESSED, "Reset stream has a compressed block offset but stream is not compressed"); + compressedSliceInput.setPosition(compressedBlockOffset); + current = EMPTY_SLICE.getInput(); + discardedBuffer = true; + } + else { + 
discardedBuffer = false; + } + + if (decompressedOffset != current.position()) { + current.setPosition(0); + if (current.available() < decompressedOffset) { + decompressedOffset -= current.available(); + advance(); + } + current.setPosition(decompressedOffset); + } + return discardedBuffer; + } + + @Override + public long skip(long n) + throws IOException + { + if (current == null || n <= 0) { + return -1; + } + + long result = current.skip(n); + if (result != 0) { + return result; + } + if (read() == -1) { + return 0; + } + return 1 + current.skip(n - 1); + } + + // This comes from the Apache Hive ORC code + private void advance() + throws IOException + { + if (compressedSliceInput == null || compressedSliceInput.available() == 0) { + current = null; + return; + } + + // 3 byte header + // NOTE: this must match BLOCK_HEADER_SIZE + currentCompressedBlockOffset = compressedSliceInput.position(); + int b0 = compressedSliceInput.readUnsignedByte(); + int b1 = compressedSliceInput.readUnsignedByte(); + int b2 = compressedSliceInput.readUnsignedByte(); + + boolean isUncompressed = (b0 & 0x01) == 1; + int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >>> 1); + + Slice chunk = compressedSliceInput.readSlice(chunkLength); + + if (isUncompressed) { + current = chunk.getInput(); + } + else { + if (buffer == null) { + buffer = Slices.allocate(bufferSize); + } + + int uncompressedSize; + if (compressionKind == ZLIB) { + uncompressedSize = decompressZip(chunk, buffer); + } + else { + uncompressedSize = decompressSnappy(chunk, buffer); + } + + current = buffer.slice(0, uncompressedSize).getInput(); + } + } + + @Override + public String toString() + { + return MoreObjects.toStringHelper(this) + .add("source", source) + .add("compressedOffset", compressedSliceInput.position()) + .add("uncompressedOffset", current == null ? 
null : current.position()) + .add("compression", compressionKind) + .toString(); + } + + // This comes from the Apache Hive ORC code + private static int decompressZip(Slice in, Slice buffer) + throws IOException + { + byte[] outArray = (byte[]) buffer.getBase(); + int outOffset = 0; + + byte[] inArray = (byte[]) in.getBase(); + int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET); + int inLength = in.length(); + + Inflater inflater = new Inflater(true); + inflater.setInput(inArray, inOffset, inLength); + while (!(inflater.finished() || inflater.needsDictionary() || inflater.needsInput())) { + try { + int count = inflater.inflate(outArray, outOffset, outArray.length - outOffset); + outOffset += count; + } + catch (DataFormatException e) { + throw new OrcCorruptionException(e, "Invalid compressed stream"); + } + } + inflater.end(); + return outOffset; + } + + private static int decompressSnappy(Slice in, Slice buffer) + throws IOException + { + byte[] outArray = (byte[]) buffer.getBase(); + + byte[] inArray = (byte[]) in.getBase(); + int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET); + int inLength = in.length(); + + return Snappy.uncompress(inArray, inOffset, inLength, outArray, 0); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java new file mode 100644 index 0000000000..2f04155d6c --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java @@ -0,0 +1,61 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import java.io.IOException; +import java.io.InputStream; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; + +final class OrcStreamUtils +{ + public static final int MIN_REPEAT_SIZE = 3; + + private OrcStreamUtils() + { + } + + public static void skipFully(InputStream input, long length) + throws IOException + { + while (length > 0) { + long result = input.skip(length); + verifyFormat(result >= 0, "Unexpected end of stream"); + length -= result; + } + } + + public static void readFully(InputStream input, byte[] buffer, int offset, int length) + throws IOException + { + while (offset < length) { + int result = input.read(buffer, offset, length - offset); + verifyFormat(result >= 0, "Unexpected end of stream"); + offset += result; + } + } + + static B checkType(A value, Class target, String name) + { + checkNotNull(value, "%s is null", name); + checkArgument(target.isInstance(value), + "%s must be of type %s, not %s", + name, + target.getName(), + value.getClass().getName()); + return target.cast(value); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java new file mode 100644 index 0000000000..e03dbbbae1 --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc.stream; + +import com.google.common.collect.ImmutableMap; +import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; +import org.apache.tajo.storage.thirdparty.orc.StreamId; +import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; + +import javax.annotation.Nonnull; +import java.util.Map; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; + +public class StreamSources +{ + private final Map> streamSources; + + public StreamSources(Map> streamSources) + { + this.streamSources = ImmutableMap.copyOf(checkNotNull(streamSources, "streamSources is null")); + } + + @Nonnull + public > StreamSource getStreamSource(StreamDescriptor streamDescriptor, StreamKind streamKind, Class streamType) + { + checkNotNull(streamDescriptor, "streamDescriptor is null"); + checkNotNull(streamType, "streamType is null"); + + StreamSource streamSource = streamSources.get(new StreamId(streamDescriptor.getStreamId(), streamKind)); + if (streamSource == null) { + streamSource = missingStreamSource(streamType); + } + + 
checkArgument(streamType.isAssignableFrom(streamSource.getStreamType()), + "%s must be of type %s, not %s", + streamDescriptor, + streamType.getName(), + streamSource.getStreamType().getName()); + + return (StreamSource) streamSource; + } +} From cc49b96eb584faacfa36314128e483377c49500d Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 20 May 2015 12:04:11 +0900 Subject: [PATCH 067/141] HdfsOrcDataSource constructor is changed to receive double instead of DataSize --- .../tajo/storage/thirdparty/orc/HdfsOrcDataSource.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java index a373c27581..16414d2016 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java @@ -40,14 +40,15 @@ public class HdfsOrcDataSource private final DataSize maxMergeDistance; private long readTimeNanos; - public HdfsOrcDataSource(String path, FSDataInputStream inputStream, long size, DataSize maxMergeDistance) + public HdfsOrcDataSource(String path, FSDataInputStream inputStream, long size, double maxMergeDistance) { this.path = checkNotNull(path, "path is null"); this.inputStream = checkNotNull(inputStream, "inputStream is null"); this.size = size; checkArgument(size >= 0, "size is negative"); - this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); + DataSize mergeDistance = new DataSize(maxMergeDistance, DataSize.Unit.BYTE); + this.maxMergeDistance = checkNotNull(mergeDistance, "maxMergeDistance is null"); } @Override From b309bc89f25e189358640800971bda9d3bd0345f Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 20 May 2015 14:34:34 +0900 
Subject: [PATCH 068/141] Initial OrcScanner --- .../apache/tajo/storage/orc/OrcScanner.java | 237 ++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java new file mode 100644 index 0000000000..e161b1dc2e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -0,0 +1,237 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.conf.TajoConf; +import org.apache.tajo.datum.*; +import org.apache.tajo.exception.UnsupportedException; +import org.apache.tajo.storage.FileScanner; +import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.VTuple; +import org.apache.tajo.storage.fragment.Fragment; +import org.apache.tajo.storage.thirdparty.orc.*; +import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnStatistics; +import org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader; +import org.joda.time.DateTimeZone; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +public class OrcScanner extends FileScanner { + private OrcRecordReader recordReader; + private Vector [] vectors; + private int currentPosInBatch = 0; + private int batchSize = 0; + + public OrcScanner(Configuration conf, final Schema schema, final TableMeta meta, final Fragment fragment) { + super(conf, schema, meta, fragment); + } + + private Vector createOrcVector(TajoDataTypes.Type type) { + switch (type) { + case INT1: case INT2: case INT4: case INT8: + case UINT1: case UINT2: case UINT4: case UINT8: + return new LongVector(); + + case FLOAT4: + case FLOAT8: + return new DoubleVector(); + + case BOOLEAN: + return new BooleanVector(); + + case BLOB: + case TEXT: + return new SliceVector(); + + default: + throw new UnsupportedException("This data type is not supported currently: "+type.toString()); + } + } + + + private FileSystem fs; + private FSDataInputStream fis; + + @Override + public void init() throws IOException { + OrcReader orcReader; + + if (targets == null) { + targets = 
schema.toArray(); + } + + super.init(); + + Path path = fragment.getPath(); + + // FileFragment information + if(fs == null) { + fs = FileScanner.getFileSystem((TajoConf)conf, path); + } + if(fis == null) fis = fs.open(path); + + OrcDataSource orcDataSource = new HdfsOrcDataSource( + this.fragment.getPath().toString(), + fis, + fs.getFileStatus(path).getLen(), + 200000000); + + for (int i=0; i columnSet = new HashSet(); + for (int i=0; i statisticsByColumnIndex) { + return true; + } + }, + 0, 1024, DateTimeZone.getDefault()); + + getNextBatch(); + } + + @Override + public Tuple next() throws IOException { + if (currentPosInBatch == batchSize) { + getNextBatch(); + + // EOF + if (batchSize == -1) { + return null; + } + } + + int columnSize = schema.size(); + Tuple tuple = new VTuple(columnSize); + + for (int i=0; i Date: Wed, 20 May 2015 15:05:52 +0900 Subject: [PATCH 069/141] Close code error fixed --- .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index e161b1dc2e..e8a7dbf0eb 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -217,7 +217,9 @@ public void reset() throws IOException { @Override public void close() throws IOException { - recordReader.close(); + if (recordReader != null) { + recordReader.close(); + } } @Override From 3c9b6c51976b277e86a7e05a4e8f540bac049239 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 20 May 2015 15:57:53 +0900 Subject: [PATCH 070/141] Creating vectors missed --- .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index e8a7dbf0eb..9a521122e0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -74,7 +74,6 @@ private Vector createOrcVector(TajoDataTypes.Type type) { } } - private FileSystem fs; private FSDataInputStream fis; @@ -102,6 +101,7 @@ public void init() throws IOException { fs.getFileStatus(path).getLen(), 200000000); + vectors = new Vector[schema.size()]; for (int i=0; i Date: Wed, 20 May 2015 17:37:51 +0900 Subject: [PATCH 071/141] Add comment --- .../apache/tajo/storage/orc/OrcScanner.java | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 9a521122e0..9b37255279 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -42,6 +42,9 @@ import java.util.Map; import java.util.Set; +/** + * OrcScanner for reading ORC files + */ public class OrcScanner extends FileScanner { private OrcRecordReader recordReader; private Vector [] vectors; @@ -89,23 +92,28 @@ public void init() throws IOException { Path path = fragment.getPath(); - // FileFragment information if(fs == null) { fs = FileScanner.getFileSystem((TajoConf)conf, path); } - if(fis == null) fis = fs.open(path); + if(fis == null) { + fis = fs.open(path); + } + + // TODO: max merge distance should be fetched from conf OrcDataSource orcDataSource = new HdfsOrcDataSource( this.fragment.getPath().toString(), fis, 
fs.getFileStatus(path).getLen(), 200000000); + // creating vectors for buffering vectors = new Vector[schema.size()]; for (int i=0; i columnSet = new HashSet(); for (int i=0; i statisticsByColumnIndex) { @@ -152,6 +158,7 @@ public Tuple next() throws IOException { return tuple; } + // TODO: support more types private Datum createValueDatum(Vector vector, TajoDataTypes.Type type) { switch (type) { case INT1: @@ -212,7 +219,6 @@ public float getProgress() { @Override public void reset() throws IOException { - } @Override From 83d926739011f2420a84d7543bb5c82b72ce3d6e Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 16:21:02 +0900 Subject: [PATCH 072/141] FileOrcDataSource constructor modified --- .../apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java index 3d0c42eb89..6b04204668 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java @@ -37,14 +37,14 @@ public class FileOrcDataSource private final DataSize maxMergeDistance; private long readTimeNanos; - public FileOrcDataSource(File path, DataSize maxMergeDistance) + public FileOrcDataSource(File path, double mergeDistance) throws IOException { this.path = checkNotNull(path, "path is null"); this.size = path.length(); this.input = new RandomAccessFile(path, "r"); - this.maxMergeDistance = checkNotNull(maxMergeDistance, "maxMergeDistance is null"); + maxMergeDistance = new DataSize(mergeDistance, DataSize.Unit.BYTE); } @Override From 0bfe20ee11b219583389ccb142c6517fbd2af290 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 
2015 16:21:36 +0900 Subject: [PATCH 073/141] Supporting timestamp --- .../main/java/org/apache/tajo/storage/orc/OrcScanner.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 9b37255279..75add87a73 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -35,6 +35,7 @@ import org.apache.tajo.storage.thirdparty.orc.*; import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnStatistics; import org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader; +import org.apache.tajo.util.datetime.DateTimeUtil; import org.joda.time.DateTimeZone; import java.io.IOException; @@ -59,6 +60,7 @@ private Vector createOrcVector(TajoDataTypes.Type type) { switch (type) { case INT1: case INT2: case INT4: case INT8: case UINT1: case UINT2: case UINT4: case UINT8: + case TIMESTAMP: return new LongVector(); case FLOAT4: @@ -190,6 +192,9 @@ private Datum createValueDatum(Vector vector, TajoDataTypes.Type type) { case BLOB: return new BlobDatum(((SliceVector)vector).vector[currentPosInBatch].getBytes()); + case TIMESTAMP: + return new TimestampDatum(DateTimeUtil.javaTimeToJulianTime(((LongVector) vector).vector[currentPosInBatch])); + default: throw new UnsupportedException("This data type is not supported currently: "+type.toString()); } From b54c4f8a2df16cefa6d242658e3f9e44d9c3142b Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 16:36:53 +0900 Subject: [PATCH 074/141] OrcScaner test added --- .../src/test/resources/storage-default.xml | 11 +- .../tajo/storage/orc/TestOrcScanner.java | 107 ++++++++++++++++++ .../src/test/resources/dataset/u_data_20.orc | Bin 0 -> 813 bytes 3 files changed, 117 insertions(+), 1 deletion(-) 
create mode 100644 tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/u_data_20.orc diff --git a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml index 712f66428f..cbb8115b32 100644 --- a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml @@ -38,7 +38,7 @@ tajo.storage.scanner-handler - csv,raw,rcfile,row,trevni,parquet,sequencefile,avro + csv,raw,rcfile,row,trevni,parquet,orc,sequencefile,avro @@ -66,6 +66,10 @@ tajo.storage.fragment.parquet.class org.apache.tajo.storage.FileFragment + + tajo.storage.fragment.orc.class + org.apache.tajo.storage.FileFragment + tajo.storage.fragment.sequencefile.class org.apache.tajo.storage.fragment.FileFragment @@ -106,6 +110,11 @@ org.apache.tajo.storage.parquet.ParquetScanner + + tajo.storage.scanner-handler.orc.class + org.apache.tajo.storage.orc.OrcScanner + + tajo.storage.scanner-handler.sequencefile.class org.apache.tajo.storage.sequencefile.SequenceFileScanner diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java new file mode 100644 index 0000000000..8b60b9c2c7 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.conf.TajoConf; +import org.apache.tajo.datum.TimestampDatum; +import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.fragment.FileFragment; +import org.apache.tajo.storage.fragment.Fragment; +import org.apache.tajo.util.KeyValueSet; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.*; + +import java.io.IOException; +import java.net.URL; + +public class TestOrcScanner { + private OrcScanner orcScanner; + + public static Path getResourcePath(String path, String suffix) { + URL resultBaseURL = ClassLoader.getSystemResource(path); + return new Path(resultBaseURL.toString(), suffix); + } + + private static FileFragment getFileFragment(Configuration conf, String fileName) throws IOException { + Path tablePath = new Path(getResourcePath("dataset", "."), fileName); + FileSystem fs = FileSystem.getLocal(conf); + FileStatus status = fs.getFileStatus(tablePath); + return new FileFragment("table", tablePath, 0, status.getLen()); + } + + @Before + public void setup() throws 
IOException { + Schema schema = new Schema(); + schema.addColumn("userid", TajoDataTypes.Type.INT4); + schema.addColumn("movieid", TajoDataTypes.Type.INT4); + schema.addColumn("rating", TajoDataTypes.Type.INT2); + schema.addColumn("unixtimestamp", TajoDataTypes.Type.TEXT); + schema.addColumn("faketime", TajoDataTypes.Type.TIMESTAMP); + + Configuration conf = new TajoConf(); + + TableMeta meta = new TableMeta("ORC", new KeyValueSet()); + + Fragment fragment = getFileFragment(conf, "u_data_20.orc"); + + orcScanner = new OrcScanner(conf, schema, meta, fragment); + + orcScanner.init(); + } + + @Test + public void testReadTuple() { + try { + Tuple tuple = orcScanner.next(); + + assertEquals(tuple.getInt4(0), 196); + assertEquals(tuple.getInt4(1), 242); + assertEquals(tuple.getInt2(2), 3); + assertEquals(tuple.getText(3), "881250949"); + + // Timestamp test + TimestampDatum timestamp = (TimestampDatum)tuple.get(4); + + assertEquals(timestamp.getYear(), 2008); + assertEquals(timestamp.getMonthOfYear(), 12); + assertEquals(timestamp.getDayOfMonth(), 12); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @After + public void end() { + try { + orcScanner.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } +} \ No newline at end of file diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/u_data_20.orc b/tajo-storage/tajo-storage-hdfs/src/test/resources/dataset/u_data_20.orc new file mode 100644 index 0000000000000000000000000000000000000000..e6e9c49e0a8747934f03c2220c0ed8cf17e40169 GIT binary patch literal 813 zcmeYdau#G@;9?VE;SdR6&<8R_xtJLk7=-vZM1(jvcmy7?NX)VgU@!#Ah(Todfij#N zRssivBu+Il1~6y?rGz0;JU}Tn4kiIEiK+mGAO?oVmmVagGB7Gu@(HSX>iKaO3v(w6 zdkYII`f+3aal(upj3o>@j4v2{m_ityG5%soV&Gw@VX|SIz_^6*08<>p7seJQH^wUv#ZQ=G zm_9M2F>PVq!X(7Jh`Ehn4HFal8^%2h0Ss1bO3Z9bDjX`D4)P2PZair!8eVB$3mDkk zn3uF1VPFvLJt=s=f#*=vr-M>2_HUdppN(7j>mDN=^Vvr_Cf%qFik+LjH2Sk8XVm0~ zS2sFd>=V?Rd%MwULwNMwMbn#I8$Leo44>h;H1zkI|Bm7xB-pENM}FJzL8c9D7THEfX#9>@-%W;h7 
zSYYRwzc!y4xIudMDl>F3FvOlR*cNhgZ&;DKPp5hcUZVke%nf3mb(!@|emIVUTHh3f+Y>kW$;EIU+IHQ3;fO$Syb8Kt-y zna^r|^NOy-CfqyvO?kExlLUhgBg5kau^-2EjvhX!v!N$&(R9y)&dx5L?al5TQzv%L zYz^%)wX?K8&fp~Jq@m1_%CpdC$_$Gc6Ai2dilsUtHWqZS7({$5?BFOY_`qS4G;u~M z_mj`eKj+AVF|oI(_?a3pu~sxN<;eIj#R!Eu%(H3ln4}`Uyme=1>&DKdot*;9TbE93 w+_`k(#+^sF)fO;ePiK6>e;7DcG72zCG&Be(F){FH?OaK4? literal 0 HcmV?d00001 From 533994e37df0dd50af2ae6dd9c8898194a2aa527 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 20 May 2015 14:52:22 +0900 Subject: [PATCH 075/141] Added orc row in storage-default.xml --- .../src/main/resources/storage-default.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index aa078a7494..6b5143d6b0 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -120,6 +120,11 @@ org.apache.tajo.storage.parquet.ParquetScanner + + tajo.storage.scanner-handler.orc.class + org.apache.tajo.storage.orc.OrcScanner + + tajo.storage.scanner-handler.sequencefile.class org.apache.tajo.storage.sequencefile.SequenceFileScanner From ccca691af61abd95a153cafdc930cc25f3887fe1 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sun, 29 Mar 2015 16:14:11 +0900 Subject: [PATCH 076/141] TAJO-1463: Add ORCFile store type to create ORCFile table --- .../src/main/java/org/apache/tajo/catalog/CatalogUtil.java | 2 ++ .../tajo-catalog-common/src/main/proto/CatalogProtos.proto | 1 + 2 files changed, 3 insertions(+) diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java index 45609d0c87..d2b8e67d10 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java +++ 
b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java @@ -279,6 +279,8 @@ public static StoreType getStoreType(final String typeStr) { return StoreType.ROWFILE; } else if (typeStr.equalsIgnoreCase(StoreType.RCFILE.name())) { return StoreType.RCFILE; + } else if (typeStr.equalsIgnoreCase(StoreType.ORCFILE.name())) { + return StoreType.ORCFILE; } else if (typeStr.equalsIgnoreCase(StoreType.PARQUET.name())) { return StoreType.PARQUET; } else if (typeStr.equalsIgnoreCase(StoreType.SEQUENCEFILE.name())) { diff --git a/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto b/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto index a204685f63..b2b690e4a7 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto +++ b/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto @@ -32,6 +32,7 @@ enum StoreType { RCFILE = 3; ROWFILE = 4; HCFILE = 5; + ORCFILE = 6; PARQUET = 7; SEQUENCEFILE = 8; AVRO = 9; From 2f7103f1e0bdf151c164e31d45a9dee04fe359c6 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 17:10:12 +0900 Subject: [PATCH 077/141] TimestampDatum comment fixed --- .../src/main/java/org/apache/tajo/datum/TimestampDatum.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java index 9ad5f2b2aa..d1b90d0d17 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimestampDatum.java @@ -36,7 +36,7 @@ public class TimestampDatum extends Datum { /** * - * @param timestamp UTC based + * @param timestamp UTC based Julian time microseconds */ public TimestampDatum(long timestamp) { super(TajoDataTypes.Type.TIMESTAMP); From 5af2113a751268a81ac45861e24f2ea378c8e773 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 21 May 2015 18:23:59 
+0900 Subject: [PATCH 078/141] Imported presto-orc maven jar and removed most of classes from Presto --- tajo-storage/tajo-storage-hdfs/pom.xml | 5 + .../apache/tajo/storage/orc/OrcScanner.java | 9 +- .../storage/thirdparty/orc/BooleanVector.java | 36 -- .../storage/thirdparty/orc/DiskRange.java | 77 --- .../storage/thirdparty/orc/DoubleVector.java | 36 -- .../thirdparty/orc/FileOrcDataSource.java | 11 +- .../thirdparty/orc/HdfsOrcDataSource.java | 13 +- .../storage/thirdparty/orc/LongVector.java | 36 -- .../storage/thirdparty/orc/ObjectVector.java | 29 -- .../orc/OrcCorruptionException.java | 43 -- .../storage/thirdparty/orc/OrcDataSource.java | 37 -- .../thirdparty/orc/OrcDataSourceUtils.java | 82 ---- .../storage/thirdparty/orc/OrcReader.java | 219 --------- .../thirdparty/orc/OrcRecordReader.java | 321 ------------- .../storage/thirdparty/orc/SliceVector.java | 36 -- .../thirdparty/orc/StreamDescriptor.java | 83 ---- .../tajo/storage/thirdparty/orc/StreamId.java | 77 --- .../tajo/storage/thirdparty/orc/Stripe.java | 70 --- .../storage/thirdparty/orc/StripeReader.java | 352 -------------- .../checkpoint/BooleanStreamCheckpoint.java | 58 --- .../checkpoint/ByteArrayStreamCheckpoint.java | 50 -- .../orc/checkpoint/ByteStreamCheckpoint.java | 60 --- .../orc/checkpoint/Checkpoints.java | 405 ---------------- .../checkpoint/DoubleStreamCheckpoint.java | 50 -- .../orc/checkpoint/FloatStreamCheckpoint.java | 50 -- .../orc/checkpoint/InputStreamCheckpoint.java | 64 --- .../checkpoint/LongStreamDwrfCheckpoint.java | 50 -- .../checkpoint/LongStreamV1Checkpoint.java | 60 --- .../checkpoint/LongStreamV2Checkpoint.java | 60 --- ...GroupDictionaryLengthStreamCheckpoint.java | 53 -- .../orc/json/BooleanJsonReader.java | 117 ----- .../thirdparty/orc/json/ByteJsonReader.java | 118 ----- .../thirdparty/orc/json/DateJsonReader.java | 123 ----- .../thirdparty/orc/json/DoubleJsonReader.java | 120 ----- .../thirdparty/orc/json/FloatJsonReader.java | 122 ----- 
.../thirdparty/orc/json/JsonMapKeyReader.java | 23 - .../thirdparty/orc/json/JsonReader.java | 36 -- .../thirdparty/orc/json/JsonReaders.java | 100 ---- .../thirdparty/orc/json/ListJsonReader.java | 125 ----- .../orc/json/LongDictionaryJsonReader.java | 142 ------ .../orc/json/LongDirectJsonReader.java | 112 ----- .../thirdparty/orc/json/LongJsonReader.java | 99 ---- .../thirdparty/orc/json/MapJsonReader.java | 138 ------ .../orc/json/SliceDictionaryJsonReader.java | 269 ----------- .../orc/json/SliceDirectJsonReader.java | 168 ------- .../thirdparty/orc/json/SliceJsonReader.java | 98 ---- .../thirdparty/orc/json/StructJsonReader.java | 117 ----- .../orc/json/TimestampJsonReader.java | 134 ------ .../orc/metadata/DwrfMetadataReader.java | 367 -------------- .../orc/metadata/OrcMetadataReader.java | 402 ---------------- .../orc/metadata/StringStatistics.java | 39 -- .../orc/reader/BooleanStreamReader.java | 153 ------ .../orc/reader/ByteStreamReader.java | 155 ------ .../orc/reader/DoubleStreamReader.java | 155 ------ .../orc/reader/FloatStreamReader.java | 156 ------ .../orc/reader/JsonStreamReader.java | 180 ------- .../reader/LongDictionaryStreamReader.java | 210 -------- .../orc/reader/LongDirectStreamReader.java | 155 ------ .../orc/reader/LongStreamReader.java | 88 ---- .../reader/SliceDictionaryStreamReader.java | 287 ----------- .../orc/reader/SliceDirectStreamReader.java | 198 -------- .../orc/reader/SliceStreamReader.java | 88 ---- .../thirdparty/orc/reader/StreamReaders.java | 58 --- .../orc/reader/TimestampStreamReader.java | 217 --------- .../orc/stream/ByteArrayStream.java | 67 --- .../thirdparty/orc/stream/ByteStream.java | 134 ------ .../orc/stream/CheckpointStreamSource.java | 69 --- .../thirdparty/orc/stream/DoubleStream.java | 104 ---- .../thirdparty/orc/stream/FloatStream.java | 109 ----- .../thirdparty/orc/stream/LongDecode.java | 177 ------- .../thirdparty/orc/stream/LongStreamDwrf.java | 129 ----- .../thirdparty/orc/stream/LongStreamV1.java | 
184 ------- .../thirdparty/orc/stream/LongStreamV2.java | 452 ------------------ .../thirdparty/orc/stream/OrcInputStream.java | 274 ----------- .../thirdparty/orc/stream/OrcStreamUtils.java | 61 --- .../thirdparty/orc/stream/StreamSources.java | 56 --- 76 files changed, 28 insertions(+), 9389 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java delete mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 5b36262539..7a63f67dc1 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -392,6 +392,11 @@ jackson-core 2.4.2 + + com.facebook.presto + presto-orc + 0.86 + com.facebook.hive hive-dwrf diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 75add87a73..d72c968fc1 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -32,9 +32,10 @@ import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.VTuple; import org.apache.tajo.storage.fragment.Fragment; -import org.apache.tajo.storage.thirdparty.orc.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnStatistics; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcMetadataReader; +import com.facebook.presto.orc.*; +import com.facebook.presto.orc.metadata.ColumnStatistics; +import com.facebook.presto.orc.metadata.OrcMetadataReader; +import org.apache.tajo.storage.thirdparty.orc.HdfsOrcDataSource; import org.apache.tajo.util.datetime.DateTimeUtil; import org.joda.time.DateTimeZone; @@ -107,7 +108,7 @@ public void init() throws IOException { this.fragment.getPath().toString(), fis, fs.getFileStatus(path).getLen(), - 200000000); + 100000000); // creating vectors for buffering vectors = new Vector[schema.size()]; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java deleted file mode 100644 index 
aaa1ada35c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanVector.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public class BooleanVector - implements Vector -{ - public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; - public final boolean[] vector = new boolean[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - ObjectVector objectVector = new ObjectVector(); - for (int i = 0; i < size; i++) { - if (!isNull[i]) { - objectVector.vector[i] = vector[i]; - } - } - return objectVector; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java deleted file mode 100644 index 8a3f249c3f..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DiskRange.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.primitives.Ints; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -public final class DiskRange -{ - private final long offset; - private final int length; - - public DiskRange(long offset, int length) - { - checkArgument(offset >= 0, "offset is negative"); - checkArgument(length >= 0, "length is negative"); - - this.offset = offset; - this.length = length; - } - - public long getOffset() - { - return offset; - } - - public int getLength() - { - return length; - } - - public long getEnd() - { - return offset + length; - } - - public boolean contains(DiskRange diskRange) - { - return offset <= diskRange.getOffset() && diskRange.getEnd() <= getEnd(); - } - - /** - * Returns the minimal DiskRange that encloses both this DiskRange - * and otherDiskRange. If there was a gap between the ranges the - * new range will cover that gap. 
- */ - public DiskRange span(DiskRange otherDiskRange) - { - checkNotNull(otherDiskRange, "otherDiskRange is null"); - long start = Math.min(this.offset, otherDiskRange.getOffset()); - long end = Math.max(getEnd(), otherDiskRange.getEnd()); - return new DiskRange(start, Ints.checkedCast(end - start)); - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("offset", offset) - .add("length", length) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java deleted file mode 100644 index 8f20d29590..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleVector.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public class DoubleVector - implements Vector -{ - public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; - public final double[] vector = new double[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - ObjectVector objectVector = new ObjectVector(); - for (int i = 0; i < size; i++) { - if (!isNull[i]) { - objectVector.vector[i] = vector[i]; - } - } - return objectVector; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java index 6b04204668..dcc134705b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/FileOrcDataSource.java @@ -13,6 +13,8 @@ */ package org.apache.tajo.storage.thirdparty.orc; +import com.facebook.presto.orc.DiskRange; +import com.facebook.presto.orc.OrcDataSource; import com.google.common.collect.ImmutableMap; import io.airlift.slice.Slice; import io.airlift.units.DataSize; @@ -25,9 +27,14 @@ import java.util.Map.Entry; import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; +import static com.facebook.presto.orc.OrcDataSourceUtils.getDiskRangeSlice; +import static com.facebook.presto.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; +/** + * File data source class for Orc Reader + * + * Most of code is from Presto + */ public class FileOrcDataSource implements OrcDataSource { diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java index 16414d2016..73ea47538d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/HdfsOrcDataSource.java @@ -14,8 +14,8 @@ */ package org.apache.tajo.storage.thirdparty.orc; -import org.apache.tajo.storage.thirdparty.orc.DiskRange; -import org.apache.tajo.storage.thirdparty.orc.OrcDataSource; +import com.facebook.presto.orc.DiskRange; +import com.facebook.presto.orc.OrcDataSource; import com.google.common.collect.ImmutableMap; import io.airlift.slice.Slice; import io.airlift.units.DataSize; @@ -26,11 +26,16 @@ import java.util.Map; import java.util.Map.Entry; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.getDiskRangeSlice; -import static org.apache.tajo.storage.thirdparty.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; +import static com.facebook.presto.orc.OrcDataSourceUtils.getDiskRangeSlice; +import static com.facebook.presto.orc.OrcDataSourceUtils.mergeAdjacentDiskRanges; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; +/** + * HDFS File data source class for Orc Reader + * + * Most of code is from Presto + */ public class HdfsOrcDataSource implements OrcDataSource { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java deleted file mode 100644 index 7c9407a3e6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/LongVector.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed 
under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public class LongVector - implements Vector -{ - public final boolean[] isNull = new boolean[MAX_VECTOR_LENGTH]; - public final long[] vector = new long[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - ObjectVector objectVector = new ObjectVector(); - for (int i = 0; i < size; i++) { - if (!isNull[i]) { - objectVector.vector[i] = vector[i]; - } - } - return objectVector; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java deleted file mode 100644 index 19f9608f7d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ObjectVector.java +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; - -public class ObjectVector - implements Vector -{ - public final Object[] vector = new Object[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - return this; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java deleted file mode 100644 index c780bcb51f..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcCorruptionException.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.jetbrains.annotations.Contract; - -import java.io.IOException; - -import static java.lang.String.format; - -public class OrcCorruptionException - extends IOException -{ - @Contract("false, _, _ -> fail") - public static void verifyFormat(boolean test, String messageFormat, Object... args) - throws OrcCorruptionException - { - if (!test) { - throw new OrcCorruptionException(messageFormat, args); - } - } - - public OrcCorruptionException(String messageFormat, Object... 
args) - { - super(format(messageFormat, args)); - } - - public OrcCorruptionException(Throwable cause, String messageFormat, Object... args) - { - super(format(messageFormat, args), cause); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java deleted file mode 100644 index 8eb1cbdd00..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSource.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import io.airlift.slice.Slice; - -import java.io.Closeable; -import java.io.IOException; -import java.util.Map; - -public interface OrcDataSource - extends Closeable -{ - long getReadTimeNanos(); - - long getSize(); - - void readFully(long position, byte[] buffer) - throws IOException; - - void readFully(long position, byte[] buffer, int bufferOffset, int bufferLength) - throws IOException; - - Map readFully(Map diskRanges) - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java deleted file mode 100644 index ba65c3c55c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcDataSourceUtils.java +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.primitives.Ints; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; -import io.airlift.units.DataSize; - -import java.util.*; -import java.util.Map.Entry; - -import static com.google.common.collect.Lists.newArrayList; - -public final class OrcDataSourceUtils -{ - private OrcDataSourceUtils() - { - } - - /** - * Merge disk ranges that are closer than {@code maxMergeDistance}. 
- */ - public static Iterable mergeAdjacentDiskRanges(Iterable diskRanges, DataSize maxMergeDistance) - { - // sort ranges by start offset - List ranges = newArrayList(diskRanges); - Collections.sort(ranges, new Comparator() { - @Override - public int compare(DiskRange o1, DiskRange o2) { - return Long.compare(o1.getOffset(), o2.getOffset()); - } - }); - - // merge overlapping ranges - long maxMergeDistanceBytes = maxMergeDistance.toBytes(); - List result = new ArrayList(); - DiskRange last = ranges.get(0); - for (int i = 1; i < ranges.size(); i++) { - DiskRange current = ranges.get(i); - if (last.getEnd() + maxMergeDistanceBytes + 1 >= current.getOffset()) { - last = last.span(current); - } - else { - result.add(last); - last = current; - } - } - result.add(last); - - return result; - } - - /** - * Get a slice for the disk range from the provided buffers. The buffers ranges do not have - * to exactly match {@code diskRange}, but {@code diskRange} must be completely contained within - * one of the buffer ranges. 
- */ - public static Slice getDiskRangeSlice(DiskRange diskRange, Map buffers) - { - for (Entry bufferEntry : buffers.entrySet()) { - DiskRange bufferRange = bufferEntry.getKey(); - byte[] buffer = bufferEntry.getValue(); - if (bufferRange.contains(diskRange)) { - int offset = Ints.checkedCast(diskRange.getOffset() - bufferRange.getOffset()); - return Slices.wrappedBuffer(buffer, offset, diskRange.getLength()); - } - } - throw new IllegalStateException("No matching buffer for disk range"); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java deleted file mode 100644 index 144baa5e7b..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcReader.java +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.base.Joiner; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.stream.OrcInputStream; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; -import org.joda.time.DateTimeZone; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Set; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static io.airlift.slice.SizeOf.SIZE_OF_BYTE; - -public class OrcReader -{ - private static final Slice MAGIC = Slices.utf8Slice("ORC"); - private static final int CURRENT_MAJOR_VERSION = 0; - private static final int CURRENT_MINOR_VERSION = 12; - private static final int EXPECTED_FOOTER_SIZE = 16 * 1024; - - private final OrcDataSource orcDataSource; - private final MetadataReader metadataReader; - private final CompressionKind compressionKind; - private final int bufferSize; - private final Footer footer; - private final Metadata metadata; - - // This is based on the Apache Hive ORC code - public OrcReader(OrcDataSource orcDataSource, MetadataReader metadataReader) - throws IOException - { - this.orcDataSource = checkNotNull(orcDataSource, "orcDataSource is null"); - this.metadataReader = checkNotNull(metadataReader, "metadataReader is null"); - - // - // Read the file tail: - // - // variable: Footer - // variable: Metadata - // variable: PostScript - contains length of footer and metadata - // 3 bytes: file magic "ORC" - // 1 byte: postScriptSize = PostScript + Magic - - // figure out the size of the file using the option or filesystem - long size = orcDataSource.getSize(); - - // Read the tail of the file - byte[] buffer = new byte[(int) Math.min(size, EXPECTED_FOOTER_SIZE)]; - orcDataSource.readFully(size - buffer.length, buffer); - - // get length of 
PostScript - last byte of the file - int postScriptSize = buffer[buffer.length - SIZE_OF_BYTE] & 0xff; - - // make sure this is an ORC file and not an RCFile or something else - verifyOrcFooter(orcDataSource, postScriptSize, buffer); - - // decode the post script - int postScriptOffset = buffer.length - SIZE_OF_BYTE - postScriptSize; - PostScript postScript = metadataReader.readPostScript(buffer, postScriptOffset, postScriptSize); - - // verify this is a supported version - checkOrcVersion(orcDataSource, postScript.getVersion()); - - // check compression codec is supported - this.compressionKind = postScript.getCompression(); - - this.bufferSize = Ints.checkedCast(postScript.getCompressionBlockSize()); - - int footerSize = Ints.checkedCast(postScript.getFooterLength()); - int metadataSize = Ints.checkedCast(postScript.getMetadataLength()); - - // check if extra bytes need to be read - Slice completeFooterSlice; - int completeFooterSize = footerSize + metadataSize + postScriptSize + SIZE_OF_BYTE; - if (completeFooterSize > buffer.length) { - // allocate a new buffer large enough for the complete footer - byte[] newBuffer = new byte[completeFooterSize]; - completeFooterSlice = Slices.wrappedBuffer(newBuffer); - - // initial read was not large enough, so read missing section - orcDataSource.readFully(size - completeFooterSize, newBuffer, 0, completeFooterSize - buffer.length); - - // copy already read bytes into the new buffer - completeFooterSlice.setBytes(completeFooterSize - buffer.length, buffer); - } - else { - // footer is already in the bytes in buffer, just adjust position, length - completeFooterSlice = Slices.wrappedBuffer(buffer, buffer.length - completeFooterSize, completeFooterSize); - } - - // read metadata - Slice metadataSlice = completeFooterSlice.slice(0, metadataSize); - InputStream metadataInputStream = new OrcInputStream(orcDataSource.toString(), metadataSlice.getInput(), compressionKind, bufferSize); - this.metadata = 
metadataReader.readMetadata(metadataInputStream); - - // read footer - Slice footerSlice = completeFooterSlice.slice(metadataSize, footerSize); - InputStream footerInputStream = new OrcInputStream(orcDataSource.toString(), footerSlice.getInput(), compressionKind, bufferSize); - this.footer = metadataReader.readFooter(footerInputStream); - } - - public List getColumnNames() - { - return footer.getTypes().get(0).getFieldNames(); - } - - public Footer getFooter() - { - return footer; - } - - public Metadata getMetadata() - { - return metadata; - } - - public CompressionKind getCompressionKind() - { - return compressionKind; - } - - public int getBufferSize() - { - return bufferSize; - } - - public OrcRecordReader createRecordReader( - Set includedColumns, - OrcPredicate predicate, - long offset, - long length, - DateTimeZone hiveStorageTimeZone) - throws IOException - { - return new OrcRecordReader( - checkNotNull(includedColumns, "includedColumns is null"), - checkNotNull(predicate, "predicate is null"), - footer.getNumberOfRows(), - footer.getStripes(), - footer.getFileStats(), - metadata.getStripeStatsList(), - orcDataSource, - offset, - length, - footer.getTypes(), - compressionKind, - bufferSize, - footer.getRowsInRowGroup(), - checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"), - metadataReader); - } - - /** - * Verify this is an ORC file to prevent users from trying to read text - * files or RC files as ORC files. - */ - // This is based on the Apache Hive ORC code - private static void verifyOrcFooter( - OrcDataSource source, - int postScriptSize, - byte[] buffer) - throws IOException - { - int magicLength = MAGIC.length(); - checkArgument(postScriptSize >= magicLength + 1, "Malformed ORC file %s. 
Invalid postscript length %s", source, postScriptSize); - - if (!MAGIC.equals(Slices.wrappedBuffer(buffer, buffer.length - 1 - magicLength, magicLength))) { - // Old versions of ORC (0.11) wrote the magic to the head of the file - byte[] headerMagic = new byte[magicLength]; - source.readFully(0, headerMagic); - - // if it isn't there, this isn't an ORC file - checkArgument(MAGIC.equals(Slices.wrappedBuffer(headerMagic)), "Malformed ORC file %s. Invalid postscript.", source); - } - } - - /** - * Check to see if this ORC file is from a future version and if so, - * warn the user that we may not be able to read all of the column encodings. - */ - // This is based on the Apache Hive ORC code - private static void checkOrcVersion(OrcDataSource orcDataSource, List version) - { - if (version.size() >= 1) { - int major = version.get(0); - int minor = 0; - if (version.size() > 1) { - minor = version.get(1); - } - - if (major > CURRENT_MAJOR_VERSION || (major == CURRENT_MAJOR_VERSION && minor > CURRENT_MINOR_VERSION)) { - System.err.println(String.format("ORC file %s was written by a newer Hive version %s. This file may not be readable by this version of Hive (%s.%s).", - orcDataSource, - Joiner.on('.').join(version), - CURRENT_MAJOR_VERSION, - CURRENT_MINOR_VERSION)); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java deleted file mode 100644 index 9f0e78300d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcRecordReader.java +++ /dev/null @@ -1,321 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.reader.StreamReader; -import org.apache.tajo.storage.thirdparty.orc.reader.StreamReaders; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTimeZone; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; - -public class OrcRecordReader -{ - private final OrcDataSource orcDataSource; - - private final StreamReader[] streamReaders; - - private final long totalRowCount; - private final long splitLength; - private final Set presentColumns; - private long currentPosition; - - private final List stripes; - private final StripeReader stripeReader; - private int currentStripe = -1; - - private Iterator rowGroups = ImmutableList.of().iterator(); - private long currentGroupRowCount; - private long nextRowInGroup; - - public OrcRecordReader( - Set includedColumns, - OrcPredicate predicate, - long numberOfRows, - List fileStripes, - List fileStats, - List stripeStats, - OrcDataSource 
orcDataSource, - long splitOffset, - long splitLength, - List types, - CompressionKind compressionKind, - int bufferSize, - int rowsInRowGroup, - DateTimeZone hiveStorageTimeZone, - MetadataReader metadataReader) - throws IOException - { - checkNotNull(includedColumns, "includedColumns is null"); - checkNotNull(predicate, "predicate is null"); - checkNotNull(fileStripes, "fileStripes is null"); - checkNotNull(stripeStats, "stripeStats is null"); - checkNotNull(orcDataSource, "orcDataSource is null"); - checkNotNull(types, "types is null"); - checkNotNull(compressionKind, "compressionKind is null"); - checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null"); - - // reduce the included columns to the set that is also present - ImmutableSet.Builder presentColumns = ImmutableSet.builder(); - OrcType root = types.get(0); - for (int includedColumn : includedColumns) { - // an old file can have less columns since columns can be added - // after the file was written - if (includedColumn < root.getFieldCount()) { - presentColumns.add(includedColumn); - } - } - this.presentColumns = presentColumns.build(); - - this.orcDataSource = orcDataSource; - this.splitLength = splitLength; - - // it is possible that old versions of orc use 0 to mean there are no row groups - checkArgument(rowsInRowGroup > 0, "rowsInRowGroup must be greater than zero"); - - long totalRowCount = 0; - ImmutableList.Builder stripes = ImmutableList.builder(); - if (predicate.matches(numberOfRows, getStatisticsByColumnOrdinal(root, fileStats))) { - // select stripes that start within the specified split - for (int stripeIndex = 0; stripeIndex < fileStripes.size(); stripeIndex++) { - StripeInformation stripe = fileStripes.get(stripeIndex); - if (splitContainsStripe(splitOffset, splitLength, stripe) && isStripeIncluded(root, stripe, stripeStats, predicate, stripeIndex)) { - stripes.add(stripe); - totalRowCount += stripe.getNumberOfRows(); - } - } - } - this.totalRowCount = totalRowCount; - this.stripes 
= stripes.build(); - - stripeReader = new StripeReader( - orcDataSource, - compressionKind, - types, - bufferSize, - this.presentColumns, - rowsInRowGroup, - predicate, - metadataReader); - - streamReaders = createStreamReaders(orcDataSource, types, hiveStorageTimeZone, this.presentColumns); - } - - private static boolean splitContainsStripe(long splitOffset, long splitLength, StripeInformation stripe) - { - long splitEndOffset = splitOffset + splitLength; - return splitOffset <= stripe.getOffset() && stripe.getOffset() < splitEndOffset; - } - - private static boolean isStripeIncluded( - OrcType rootStructType, - StripeInformation stripe, - List stripeStats, - OrcPredicate predicate, - int stripeIndex) - { - // if there are no stats, include the column - if (stripeIndex >= stripeStats.size()) { - return true; - } - - return predicate.matches(stripe.getNumberOfRows(), getStatisticsByColumnOrdinal(rootStructType, stripeStats.get(stripeIndex).getColumnStatistics())); - } - - public long getPosition() - { - return currentPosition; - } - - public long getTotalRowCount() - { - return totalRowCount; - } - - public float getProgress() - { - return ((float) currentPosition) / totalRowCount; - } - - public long getSplitLength() - { - return splitLength; - } - - public void close() - throws IOException - { - orcDataSource.close(); - } - - public boolean isColumnPresent(int hiveColumnIndex) - { - return presentColumns.contains(hiveColumnIndex); - } - - public int nextBatch() - throws IOException - { - // if next row is within the current group return - if (nextRowInGroup >= currentGroupRowCount) { - // attempt to advance to next row group - if (!advanceToNextRowGroup()) { - return -1; - } - } - - int batchSize = Ints.checkedCast(Math.min(Vector.MAX_VECTOR_LENGTH, currentGroupRowCount - nextRowInGroup)); - - for (StreamReader column : streamReaders) { - if (column != null) { - column.prepareNextRead(batchSize); - } - } - nextRowInGroup += batchSize; - currentPosition += 
batchSize; - return batchSize; - } - - public void readVector(int columnIndex, Object vector) - throws IOException - { - streamReaders[columnIndex].readBatch(vector); - } - - private boolean advanceToNextRowGroup() - throws IOException - { - nextRowInGroup = 0; - - while (!rowGroups.hasNext() && currentStripe < stripes.size()) { - advanceToNextStripe(); - } - - if (!rowGroups.hasNext()) { - currentGroupRowCount = 0; - return false; - } - - RowGroup currentRowGroup = rowGroups.next(); - currentGroupRowCount = currentRowGroup.getRowCount(); - - // give reader data streams from row group - StreamSources rowGroupStreamSources = currentRowGroup.getStreamSources(); - for (StreamReader column : streamReaders) { - if (column != null) { - column.startRowGroup(rowGroupStreamSources); - } - } - - return true; - } - - private void advanceToNextStripe() - throws IOException - { - currentStripe++; - if (currentStripe >= stripes.size()) { - return; - } - - StripeInformation stripeInformation = stripes.get(currentStripe); - Stripe stripe = stripeReader.readStripe(stripeInformation); - if (stripe != null) { - // Give readers access to dictionary streams - StreamSources dictionaryStreamSources = stripe.getDictionaryStreamSources(); - List columnEncodings = stripe.getColumnEncodings(); - for (StreamReader column : streamReaders) { - if (column != null) { - column.startStripe(dictionaryStreamSources, columnEncodings); - } - } - - rowGroups = stripe.getRowGroups().iterator(); - } - else { - rowGroups = ImmutableList.of().iterator(); - } - } - - private static StreamReader[] createStreamReaders(OrcDataSource orcDataSource, - List types, - DateTimeZone hiveStorageTimeZone, - Set includedColumns) - { - List streamDescriptors = createStreamDescriptor("", "", 0, types, orcDataSource).getNestedStreams(); - - OrcType rowType = types.get(0); - StreamReader[] streamReaders = new StreamReader[rowType.getFieldCount()]; - for (int columnId = 0; columnId < rowType.getFieldCount(); columnId++) { - 
if (includedColumns.contains(columnId)) { - StreamDescriptor streamDescriptor = streamDescriptors.get(columnId); - streamReaders[columnId] = StreamReaders.createStreamReader(streamDescriptor, hiveStorageTimeZone); - } - } - return streamReaders; - } - - private static StreamDescriptor createStreamDescriptor(String parentStreamName, String fieldName, int typeId, List types, OrcDataSource dataSource) - { - OrcType type = types.get(typeId); - - if (!fieldName.isEmpty()) { - parentStreamName += "." + fieldName; - } - - ImmutableList.Builder nestedStreams = ImmutableList.builder(); - if (type.getOrcTypeKind() == OrcTypeKind.STRUCT) { - for (int i = 0; i < type.getFieldCount(); ++i) { - nestedStreams.add(createStreamDescriptor(parentStreamName, type.getFieldName(i), type.getFieldTypeIndex(i), types, dataSource)); - } - } - else if (type.getOrcTypeKind() == OrcTypeKind.LIST) { - nestedStreams.add(createStreamDescriptor(parentStreamName, "item", type.getFieldTypeIndex(0), types, dataSource)); - } - else if (type.getOrcTypeKind() == OrcTypeKind.MAP) { - nestedStreams.add(createStreamDescriptor(parentStreamName, "key", type.getFieldTypeIndex(0), types, dataSource)); - nestedStreams.add(createStreamDescriptor(parentStreamName, "value", type.getFieldTypeIndex(1), types, dataSource)); - } - return new StreamDescriptor(parentStreamName, typeId, fieldName, type.getOrcTypeKind(), dataSource, nestedStreams.build()); - } - - private static Map getStatisticsByColumnOrdinal(OrcType rootStructType, List fileStats) - { - checkNotNull(rootStructType, "rootStructType is null"); - checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT); - checkNotNull(fileStats, "fileStats is null"); - - ImmutableMap.Builder statistics = ImmutableMap.builder(); - for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) { - ColumnStatistics element = fileStats.get(rootStructType.getFieldTypeIndex(ordinal)); - if (element != null) { - statistics.put(ordinal, element); - } - } - 
return statistics.build(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java deleted file mode 100644 index 01cfbfca80..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SliceVector.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.annotations.VisibleForTesting; -import io.airlift.slice.Slice; - -public class SliceVector - implements Vector -{ - public final Slice[] vector = new Slice[MAX_VECTOR_LENGTH]; - - @Override - @VisibleForTesting - public ObjectVector toObjectVector(int size) - { - ObjectVector objectVector = new ObjectVector(); - for (int i = 0; i < size; i++) { - if (vector[i] != null) { - objectVector.vector[i] = vector[i].toStringUtf8(); - } - } - return objectVector; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java deleted file mode 100644 index a8108e6f36..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamDescriptor.java +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.ImmutableList; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; - -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public final class StreamDescriptor -{ - private final String streamName; - private final int streamId; - private final OrcTypeKind streamType; - private final String fieldName; - private final OrcDataSource fileInput; - private final List nestedStreams; - - public StreamDescriptor(String streamName, int streamId, String fieldName, OrcTypeKind streamType, OrcDataSource fileInput, List nestedStreams) - { - this.streamName = checkNotNull(streamName, "streamName is null"); - this.streamId = streamId; - this.fieldName = checkNotNull(fieldName, "fieldName is null"); - this.streamType = checkNotNull(streamType, "type is null"); - this.fileInput = checkNotNull(fileInput, "fileInput is null"); - this.nestedStreams = ImmutableList.copyOf(checkNotNull(nestedStreams, "nestedStreams is null")); - } - - public String getStreamName() - { - return streamName; - } - - public int getStreamId() - { - return streamId; - } - - public OrcTypeKind getStreamType() - { - return streamType; - } - - public String getFieldName() - { - return fieldName; - } - - public OrcDataSource getFileInput() - { - return fileInput; - } - - public List getNestedStreams() - { - return nestedStreams; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("streamName", streamName) - .add("streamId", streamId) - .add("streamType", streamType) - .add("path", fileInput) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java deleted file mode 100644 index 
3cec23c247..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamId.java +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; - -import java.util.Objects; - -import static com.google.common.base.MoreObjects.toStringHelper; - -public final class StreamId -{ - private final int column; - private final StreamKind streamKind; - - public StreamId(Stream stream) - { - this.column = stream.getColumn(); - this.streamKind = stream.getStreamKind(); - } - - public StreamId(int column, StreamKind streamKind) - { - this.column = column; - this.streamKind = streamKind; - } - - public int getColumn() - { - return column; - } - - public StreamKind getStreamKind() - { - return streamKind; - } - - @Override - public int hashCode() - { - return Objects.hash(column, streamKind); - } - - @Override - public boolean equals(Object obj) - { - if (this == obj) { - return true; - } - if (obj == null || getClass() != obj.getClass()) { - return false; - } - StreamId other = (StreamId) obj; - return Objects.equals(this.column, other.column) && Objects.equals(this.streamKind, other.streamKind); - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("column", column) - .add("streamKind", 
streamKind) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java deleted file mode 100644 index a95353160e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Stripe.java +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.collect.ImmutableList; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; - -public class Stripe -{ - private final long rowCount; - private final List columnEncodings; - private final List rowGroups; - private final StreamSources dictionaryStreamSources; - - public Stripe(long rowCount, List columnEncodings, List rowGroups, StreamSources dictionaryStreamSources) - { - this.rowCount = rowCount; - this.columnEncodings = checkNotNull(columnEncodings, "columnEncodings is null"); - this.rowGroups = ImmutableList.copyOf(checkNotNull(rowGroups, "rowGroups is null")); - this.dictionaryStreamSources = checkNotNull(dictionaryStreamSources, "dictionaryStreamSources is 
null"); - } - - public long getRowCount() - { - return rowCount; - } - - public List getColumnEncodings() - { - return columnEncodings; - } - - public List getRowGroups() - { - return rowGroups; - } - - public StreamSources getDictionaryStreamSources() - { - return dictionaryStreamSources; - } - - @Override - public String toString() - { - return toStringHelper(this) - .add("rowCount", rowCount) - .add("columnEncodings", columnEncodings) - .add("rowGroups", rowGroups) - .add("dictionaryStreams", dictionaryStreamSources) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java deleted file mode 100644 index 1e4c4bc273..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeReader.java +++ /dev/null @@ -1,352 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import com.google.common.base.Function; -import com.google.common.base.Predicates; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Maps; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.stream.*; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; - -import java.io.IOException; -import java.io.InputStream; -import java.util.*; -import java.util.Map.Entry; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.getStreamCheckpoints; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.DICTIONARY_V2; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.CheckpointStreamSource.createCheckpointStreamSource; - -public class StripeReader -{ - private final OrcDataSource orcDataSource; - private final CompressionKind compressionKind; - private final List types; - private final int bufferSize; - private final Set includedOrcColumns; - private final int rowsInRowGroup; - private final OrcPredicate predicate; - private final MetadataReader 
metadataReader; - - public StripeReader(OrcDataSource orcDataSource, - CompressionKind compressionKind, - List types, - int bufferSize, - Set includedColumns, - int rowsInRowGroup, - OrcPredicate predicate, - MetadataReader metadataReader) - { - this.orcDataSource = checkNotNull(orcDataSource, "orcDataSource is null"); - this.compressionKind = checkNotNull(compressionKind, "compressionKind is null"); - this.types = ImmutableList.copyOf(checkNotNull(types, "types is null")); - this.bufferSize = bufferSize; - this.includedOrcColumns = getIncludedOrcColumns(types, checkNotNull(includedColumns, "includedColumns is null")); - this.rowsInRowGroup = rowsInRowGroup; - this.predicate = checkNotNull(predicate, "predicate is null"); - this.metadataReader = checkNotNull(metadataReader, "metadataReader is null"); - } - - public Stripe readStripe(StripeInformation stripe) - throws IOException - { - // read the stripe footer - StripeFooter stripeFooter = readStripeFooter(stripe); - List columnEncodings = stripeFooter.getColumnEncodings(); - - // get streams for selected columns - Map streams = new HashMap(); - for (Stream stream : stripeFooter.getStreams()) { - if (includedOrcColumns.contains(stream.getColumn())) { - streams.put(new StreamId(stream), stream); - } - } - - // determine ranges of the stripe to read - Map diskRanges = getDiskRanges(stripeFooter.getStreams()); - diskRanges = Maps.filterKeys(diskRanges, Predicates.in(streams.keySet())); - - // read the file regions - Map streamsData = readDiskRanges(stripe.getOffset(), diskRanges); - - // read the row index for each column - Map> columnIndexes = readColumnIndexes(streams, streamsData); - - // select the row groups matching the tuple domain - Set selectedRowGroups = selectRowGroups(stripe, columnIndexes); - - // if all row groups are skipped, return null - if (selectedRowGroups.isEmpty()) { - return null; - } - - // value streams - Map> valueStreams = createValueStreams(streams, streamsData, columnEncodings); - - // 
build the dictionary streams - StreamSources dictionaryStreamSources = createDictionaryStreamSources(streams, valueStreams, columnEncodings); - - // build the row groups - List rowGroups = createRowGroups( - stripe.getNumberOfRows(), - streams, - valueStreams, - columnIndexes, - selectedRowGroups, - columnEncodings); - - return new Stripe(stripe.getNumberOfRows(), columnEncodings, rowGroups, dictionaryStreamSources); - } - - public Map readDiskRanges(final long stripeOffset, Map diskRanges) - throws IOException - { - // transform ranges to have an absolute offset in file - diskRanges = Maps.transformValues(diskRanges, new Function() { - @Override - public DiskRange apply(DiskRange diskRange) - { - return new DiskRange(stripeOffset + diskRange.getOffset(), diskRange.getLength()); - } - }); - - Map streamsData = orcDataSource.readFully(diskRanges); - - return ImmutableMap.copyOf(Maps.transformValues(streamsData, new Function() - { - @Override - public OrcInputStream apply(Slice input) - { - return new OrcInputStream(orcDataSource.toString(), input.getInput(), compressionKind, bufferSize); - } - })); - } - - private Map> createValueStreams(Map streams, Map streamsData, List columnEncodings) - { - ImmutableMap.Builder> valueStreams = ImmutableMap.builder(); - for (Entry entry : streams.entrySet()) { - StreamId streamId = entry.getKey(); - Stream stream = entry.getValue(); - ColumnEncodingKind columnEncoding = columnEncodings.get(stream.getColumn()).getColumnEncodingKind(); - - // skip index and empty streams - if (isIndexStream(stream) || stream.getLength() == 0) { - continue; - } - - OrcInputStream inputStream = streamsData.get(streamId); - OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); - - valueStreams.put(streamId, ValueStreams.createValueStreams(streamId, inputStream, columnType, columnEncoding, stream.isUseVInts())); - } - return valueStreams.build(); - } - - public StreamSources createDictionaryStreamSources(Map streams, Map> 
valueStreams, List columnEncodings) - { - ImmutableMap.Builder> dictionaryStreamBuilder = ImmutableMap.builder(); - for (Entry entry : streams.entrySet()) { - StreamId streamId = entry.getKey(); - Stream stream = entry.getValue(); - int column = stream.getColumn(); - - // only process dictionary streams - ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind(); - if (!isDictionary(stream, columnEncoding)) { - continue; - } - - // skip streams without data - ValueStream valueStream = valueStreams.get(streamId); - if (valueStream == null) { - continue; - } - - OrcTypeKind columnType = types.get(stream.getColumn()).getOrcTypeKind(); - StreamCheckpoint streamCheckpoint = getDictionaryStreamCheckpoint(streamId, columnType, columnEncoding); - - StreamSource streamSource = createCheckpointStreamSource(valueStream, streamCheckpoint); - dictionaryStreamBuilder.put(streamId, streamSource); - } - return new StreamSources(dictionaryStreamBuilder.build()); - } - - private List createRowGroups( - int rowsInStripe, - Map streams, - Map> valueStreams, - Map> columnIndexes, - Set selectedRowGroups, - List encodings) - { - ImmutableList.Builder rowGroupBuilder = ImmutableList.builder(); - - for (int rowGroupId : selectedRowGroups) { - Map checkpoints = getStreamCheckpoints(includedOrcColumns, types, compressionKind, rowGroupId, encodings, streams, columnIndexes); - int rowsInGroup = Math.min(rowsInStripe - (rowGroupId * rowsInRowGroup), rowsInRowGroup); - rowGroupBuilder.add(createRowGroup(rowGroupId, rowsInGroup, valueStreams, checkpoints)); - } - - return rowGroupBuilder.build(); - } - - public static RowGroup createRowGroup(int groupId, int rowCount, Map> valueStreams, Map checkpoints) - { - ImmutableMap.Builder> builder = ImmutableMap.builder(); - for (Entry entry : checkpoints.entrySet()) { - StreamId streamId = entry.getKey(); - StreamCheckpoint checkpoint = entry.getValue(); - - // skip streams without data - ValueStream valueStream = 
valueStreams.get(streamId); - if (valueStream == null) { - continue; - } - - builder.put(streamId, createCheckpointStreamSource(valueStream, checkpoint)); - } - StreamSources rowGroupStreams = new StreamSources(builder.build()); - return new RowGroup(groupId, rowCount, rowGroupStreams); - } - - public StripeFooter readStripeFooter(StripeInformation stripe) - throws IOException - { - long offset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength(); - int tailLength = Ints.checkedCast(stripe.getFooterLength()); - - // read the footer - byte[] tailBuffer = new byte[tailLength]; - orcDataSource.readFully(offset, tailBuffer); - InputStream inputStream = new OrcInputStream(orcDataSource.toString(), Slices.wrappedBuffer(tailBuffer).getInput(), compressionKind, bufferSize); - return metadataReader.readStripeFooter(types, inputStream); - } - - private Map> readColumnIndexes(Map streams, Map streamsData) - throws IOException - { - ImmutableMap.Builder> columnIndexes = ImmutableMap.builder(); - for (Entry entry : streams.entrySet()) { - Stream stream = entry.getValue(); - if (stream.getStreamKind() == ROW_INDEX) { - OrcInputStream inputStream = streamsData.get(entry.getKey()); - columnIndexes.put(stream.getColumn(), metadataReader.readRowIndexes(inputStream)); - } - } - return columnIndexes.build(); - } - - private Set selectRowGroups(StripeInformation stripe, Map> columnIndexes) - throws IOException - { - int rowsInStripe = Ints.checkedCast(stripe.getNumberOfRows()); - int groupsInStripe = ceil(rowsInStripe, rowsInRowGroup); - - ImmutableSet.Builder selectedRowGroups = ImmutableSet.builder(); - int remainingRows = rowsInStripe; - for (int rowGroup = 0; rowGroup < groupsInStripe; ++rowGroup) { - int rows = Math.min(remainingRows, rowsInRowGroup); - Map statistics = getRowGroupStatistics(types.get(0), columnIndexes, rowGroup); - if (predicate.matches(rows, statistics)) { - selectedRowGroups.add(rowGroup); - } - remainingRows -= rows; - } - return 
selectedRowGroups.build(); - } - - private static Map getRowGroupStatistics(OrcType rootStructType, Map> columnIndexes, int rowGroup) - { - checkNotNull(rootStructType, "rootStructType is null"); - checkArgument(rootStructType.getOrcTypeKind() == OrcTypeKind.STRUCT); - checkNotNull(columnIndexes, "columnIndexes is null"); - checkArgument(rowGroup >= 0, "rowGroup is negative"); - - ImmutableMap.Builder statistics = ImmutableMap.builder(); - for (int ordinal = 0; ordinal < rootStructType.getFieldCount(); ordinal++) { - List rowGroupIndexes = columnIndexes.get(rootStructType.getFieldTypeIndex(ordinal)); - if (rowGroupIndexes != null) { - statistics.put(ordinal, rowGroupIndexes.get(rowGroup).getColumnStatistics()); - } - } - return statistics.build(); - } - - private static boolean isIndexStream(Stream stream) - { - return stream.getStreamKind() == ROW_INDEX || stream.getStreamKind() == DICTIONARY_COUNT; - } - - private static boolean isDictionary(Stream stream, ColumnEncodingKind columnEncoding) - { - return stream.getStreamKind() == DICTIONARY_DATA || (stream.getStreamKind() == LENGTH && (columnEncoding == DICTIONARY || columnEncoding == DICTIONARY_V2)); - } - - private static Map getDiskRanges(List streams) - { - ImmutableMap.Builder streamDiskRanges = ImmutableMap.builder(); - long stripeOffset = 0; - for (Stream stream : streams) { - int streamLength = Ints.checkedCast(stream.getLength()); - streamDiskRanges.put(new StreamId(stream), new DiskRange(stripeOffset, streamLength)); - stripeOffset += streamLength; - } - return streamDiskRanges.build(); - } - - private static Set getIncludedOrcColumns(List types, Set includedColumns) - { - Set includes = new LinkedHashSet(); - - OrcType root = types.get(0); - for (int includedColumn : includedColumns) { - includeOrcColumnsRecursive(types, includes, root.getFieldTypeIndex(includedColumn)); - } - - return includes; - } - - private static void includeOrcColumnsRecursive(List types, Set result, int typeId) - { - 
result.add(typeId); - OrcType type = types.get(typeId); - int children = type.getFieldCount(); - for (int i = 0; i < children; ++i) { - includeOrcColumnsRecursive(types, result, type.getFieldTypeIndex(i)); - } - } - - /** - * Ceiling of integer division - */ - private static int ceil(int dividend, int divisor) - { - return ((dividend + divisor) - 1) / divisor; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java deleted file mode 100644 index 4fd403e643..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/BooleanStreamCheckpoint.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static com.google.common.base.Preconditions.checkNotNull; - -public final class BooleanStreamCheckpoint - implements StreamCheckpoint -{ - private final int offset; - private final ByteStreamCheckpoint byteStreamCheckpoint; - - public BooleanStreamCheckpoint(int offset, ByteStreamCheckpoint byteStreamCheckpoint) - { - this.offset = offset; - this.byteStreamCheckpoint = checkNotNull(byteStreamCheckpoint, "byteStreamCheckpoint is null"); - } - - public BooleanStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - byteStreamCheckpoint = new ByteStreamCheckpoint(compressionKind, positionsList); - offset = positionsList.nextPosition(); - } - - public int getOffset() - { - return offset; - } - - public ByteStreamCheckpoint getByteStreamCheckpoint() - { - return byteStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("offset", offset) - .add("byteStreamCheckpoint", byteStreamCheckpoint) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java deleted file mode 100644 index a76d5c286e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteArrayStreamCheckpoint.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class ByteArrayStreamCheckpoint - implements StreamCheckpoint -{ - private final long inputStreamCheckpoint; - - public ByteArrayStreamCheckpoint(long inputStreamCheckpoint) - { - this.inputStreamCheckpoint = inputStreamCheckpoint; - } - - public ByteArrayStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java deleted file mode 100644 index c7a93ea169..0000000000 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/ByteStreamCheckpoint.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class ByteStreamCheckpoint - implements StreamCheckpoint -{ - private final int offset; - private final long inputStreamCheckpoint; - - public ByteStreamCheckpoint(int offset, long inputStreamCheckpoint) - { - this.offset = offset; - this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); - } - - public ByteStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - offset = positionsList.nextPosition(); - } - - public int getOffset() - { - return offset; - } - - public long getInputStreamCheckpoint() - { - return 
inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("offset", offset) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java deleted file mode 100644 index f346235d94..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/Checkpoints.java +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.collect.*; -import org.apache.tajo.storage.thirdparty.orc.StreamId; -import org.apache.tajo.storage.thirdparty.orc.metadata.*; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; - -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static com.google.common.base.Preconditions.checkNotNull; -import static com.google.common.base.Preconditions.checkState; -import static com.google.common.base.Predicates.equalTo; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; - -public final class Checkpoints -{ - private Checkpoints() - { - } - - public static Map getStreamCheckpoints( - Set columns, - List columnTypes, - CompressionKind compressionKind, - int rowGroupId, - List columnEncodings, - Map streams, - Map> columnIndexes) - { - ImmutableSetMultimap.Builder streamKindsBuilder = ImmutableSetMultimap.builder(); - for (Stream stream : streams.values()) { - streamKindsBuilder.put(stream.getColumn(), stream.getStreamKind()); - } - SetMultimap streamKinds = streamKindsBuilder.build(); - - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - for (int column : columns) { - List positionsList = columnIndexes.get(column).get(rowGroupId).getPositions(); - - ColumnEncodingKind columnEncoding = columnEncodings.get(column).getColumnEncodingKind(); - OrcTypeKind columnType = columnTypes.get(column).getOrcTypeKind(); - Set availableStreams = streamKinds.get(column); - - ColumnPositionsList columnPositionsList = new 
ColumnPositionsList(column, columnType, positionsList); - switch (columnType) { - case BOOLEAN: - checkpoints.putAll(getBooleanColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case BYTE: - checkpoints.putAll(getByteColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case SHORT: - case INT: - case LONG: - case DATE: - checkpoints.putAll(getLongColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); - break; - case FLOAT: - checkpoints.putAll(getFloatColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case DOUBLE: - checkpoints.putAll(getDoubleColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case TIMESTAMP: - checkpoints.putAll(getTimestampColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); - break; - case BINARY: - case STRING: - checkpoints.putAll(getSliceColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); - break; - case LIST: - case MAP: - checkpoints.putAll(getListOrMapColumnCheckpoints(column, columnEncoding, compressionKind, availableStreams, columnPositionsList)); - break; - case STRUCT: - checkpoints.putAll(getStructColumnCheckpoints(column, compressionKind, availableStreams, columnPositionsList)); - break; - case DECIMAL: - case CHAR: - case VARCHAR: - case UNION: - throw new IllegalArgumentException("Unsupported column type " + columnType); - } - - // The DWRF code is not meticulous in the handling of checkpoints. It appears that for the first row group - // it will write checkpoints for all streams, but in other cases it will write only the streams that exist. 
- // We detect this case by checking that all offsets in the initial position list are zero, and if so, we - // clear the extra offsets - checkState(!columnPositionsList.hasNextPosition() || Iterables.all(positionsList, equalTo(0)), - "Column %s, of type %s, contains %s offset positions, but only %s positions were consumed", - column, - columnType, - positionsList.size(), - columnPositionsList.getIndex()); - } - return checkpoints.build(); - } - - public static StreamCheckpoint getDictionaryStreamCheckpoint(StreamId streamId, OrcTypeKind columnType, ColumnEncodingKind columnEncoding) - { - if (streamId.getStreamKind() == DICTIONARY_DATA) { - switch (columnType) { - case SHORT: - case INT: - case LONG: - return new LongStreamDwrfCheckpoint(createInputStreamCheckpoint(0, 0)); - case STRING: - case VARCHAR: - case CHAR: - case BINARY: - return new ByteArrayStreamCheckpoint(createInputStreamCheckpoint(0, 0)); - } - } - - // dictionary length and data streams are unsigned long streams - if (streamId.getStreamKind() == LENGTH || streamId.getStreamKind() == DATA) { - if (columnEncoding == DICTIONARY_V2) { - return new LongStreamV2Checkpoint(0, createInputStreamCheckpoint(0, 0)); - } - else if (columnEncoding == DICTIONARY) { - return new LongStreamV1Checkpoint(0, createInputStreamCheckpoint(0, 0)); - } - } - throw new IllegalArgumentException("Unsupported column type " + columnType + " for dictionary stream " + streamId); - } - - private static Map getBooleanColumnCheckpoints( - int column, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - return 
checkpoints.build(); - } - - private static Map getByteColumnCheckpoints( - int column, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new ByteStreamCheckpoint(compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getLongColumnCheckpoints( - int column, - ColumnEncodingKind encoding, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(IN_DICTIONARY)) { - checkpoints.put(new StreamId(column, IN_DICTIONARY), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getFloatColumnCheckpoints( - int column, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new FloatStreamCheckpoint(compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map 
getDoubleColumnCheckpoints( - int column, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new DoubleStreamCheckpoint(compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getTimestampColumnCheckpoints( - int column, - ColumnEncodingKind encoding, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - if (availableStreams.contains(SECONDARY)) { - checkpoints.put(new StreamId(column, SECONDARY), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getSliceColumnCheckpoints( - int column, - ColumnEncodingKind encoding, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (encoding == DIRECT || encoding == DIRECT_V2) { - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), new ByteArrayStreamCheckpoint(compressionKind, positionsList)); - } - - if 
(availableStreams.contains(LENGTH)) { - checkpoints.put(new StreamId(column, LENGTH), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - } - else if (encoding == DICTIONARY || encoding == DICTIONARY_V2) { - // DWRF has rules inconsistent with the ORC style - if (availableStreams.contains(IN_DICTIONARY)) { - if (availableStreams.contains(ROW_GROUP_DICTIONARY)) { - checkpoints.put(new StreamId(column, ROW_GROUP_DICTIONARY), new ByteArrayStreamCheckpoint(compressionKind, positionsList)); - } - - checkpoints.put(new StreamId(column, ROW_GROUP_DICTIONARY_LENGTH), new RowGroupDictionaryLengthStreamCheckpoint(compressionKind, positionsList)); - - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - checkpoints.put(new StreamId(column, IN_DICTIONARY), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - else { - if (availableStreams.contains(DATA)) { - checkpoints.put(new StreamId(column, DATA), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - } - } - else { - throw new IllegalArgumentException("Unsupported encoding for slice column: " + encoding); - } - - return checkpoints.build(); - } - - private static Map getListOrMapColumnCheckpoints( - int column, - ColumnEncodingKind encoding, - CompressionKind compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - if (availableStreams.contains(LENGTH)) { - checkpoints.put(new StreamId(column, LENGTH), createLongStreamCheckpoint(encoding, compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static Map getStructColumnCheckpoints( - int column, - CompressionKind 
compressionKind, - Set availableStreams, - ColumnPositionsList positionsList) - { - ImmutableMap.Builder checkpoints = ImmutableMap.builder(); - - if (availableStreams.contains(PRESENT)) { - checkpoints.put(new StreamId(column, PRESENT), new BooleanStreamCheckpoint(compressionKind, positionsList)); - } - - return checkpoints.build(); - } - - private static StreamCheckpoint createLongStreamCheckpoint(ColumnEncodingKind encoding, CompressionKind compressionKind, ColumnPositionsList positionsList) - { - if (encoding == DIRECT_V2 || encoding == DICTIONARY_V2) { - return new LongStreamV2Checkpoint(compressionKind, positionsList); - } - - if (encoding == DIRECT || encoding == DICTIONARY) { - return new LongStreamV1Checkpoint(compressionKind, positionsList); - } - - if (encoding == DWRF_DIRECT) { - return new LongStreamDwrfCheckpoint(compressionKind, positionsList); - } - - throw new IllegalArgumentException("Unsupported encoding for long stream: " + encoding); - } - - public static class ColumnPositionsList - { - private final int column; - private final OrcTypeKind columnType; - private final List positionsList; - private int index; - - private ColumnPositionsList(int column, OrcTypeKind columnType, List positionsList) - { - this.column = column; - this.columnType = checkNotNull(columnType, "columnType is null"); - this.positionsList = ImmutableList.copyOf(checkNotNull(positionsList, "positionsList is null")); - } - - public int getIndex() - { - return index; - } - - public boolean hasNextPosition() - { - return index < positionsList.size(); - } - - public int nextPosition() - { - checkState(hasNextPosition(), "Not enough positions for column %s, of type %s, checkpoints", - column, - columnType); - - return positionsList.get(index++); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java deleted file mode 100644 index 80f03de1d9..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/DoubleStreamCheckpoint.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class DoubleStreamCheckpoint - implements StreamCheckpoint -{ - private final long inputStreamCheckpoint; - - public DoubleStreamCheckpoint(long inputStreamCheckpoint) - { - this.inputStreamCheckpoint = inputStreamCheckpoint; - } - - public DoubleStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return 
MoreObjects.toStringHelper(this) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java deleted file mode 100644 index 2d92cd3494..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/FloatStreamCheckpoint.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class FloatStreamCheckpoint - implements StreamCheckpoint -{ - private final long inputStreamCheckpoint; - - public FloatStreamCheckpoint(long inputStreamCheckpoint) - { - this.inputStreamCheckpoint = inputStreamCheckpoint; - } - - public FloatStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java deleted file mode 100644 index 92550a6b91..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/InputStreamCheckpoint.java +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.UNCOMPRESSED; - -/** - * InputStreamCheckpoint is represented as a packed long to avoid object creation in inner loops. - */ -public final class InputStreamCheckpoint -{ - private InputStreamCheckpoint() - { - } - - public static long createInputStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - if (compressionKind == UNCOMPRESSED) { - return createInputStreamCheckpoint(0, positionsList.nextPosition()); - } - else { - return createInputStreamCheckpoint(positionsList.nextPosition(), positionsList.nextPosition()); - } - } - - public static long createInputStreamCheckpoint(int compressedBlockOffset, int decompressedOffset) - { - return (((long) compressedBlockOffset) << 32) | decompressedOffset; - } - - public static int decodeCompressedBlockOffset(long inputStreamCheckpoint) - { - return ((int) (inputStreamCheckpoint >> 32)); - } - - public static int decodeDecompressedOffset(long inputStreamCheckpoint) - { - // low order bits contain the decompressed offset, so a simple cast here will suffice - return (int) inputStreamCheckpoint; - } - - public static String inputStreamCheckpointToString(long inputStreamCheckpoint) - { - return 
MoreObjects.toStringHelper(InputStreamCheckpoint.class) - .add("decompressedOffset", decodeDecompressedOffset(inputStreamCheckpoint)) - .add("compressedBlockOffset", decodeCompressedBlockOffset(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java deleted file mode 100644 index bb08edd940..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamDwrfCheckpoint.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class LongStreamDwrfCheckpoint - implements LongStreamCheckpoint -{ - private final long inputStreamCheckpoint; - - public LongStreamDwrfCheckpoint(long inputStreamCheckpoint) - { - this.inputStreamCheckpoint = inputStreamCheckpoint; - } - - public LongStreamDwrfCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java deleted file mode 100644 index 410f181d38..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV1Checkpoint.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public class LongStreamV1Checkpoint - implements LongStreamCheckpoint -{ - private final int offset; - private final long inputStreamCheckpoint; - - public LongStreamV1Checkpoint(int offset, long inputStreamCheckpoint) - { - this.offset = offset; - this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); - } - - public LongStreamV1Checkpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - offset = positionsList.nextPosition(); - } - - public int getOffset() - { - return offset; - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("offset", offset) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java deleted file mode 100644 index 352c4d1bc1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/LongStreamV2Checkpoint.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.createInputStreamCheckpoint; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class LongStreamV2Checkpoint - implements LongStreamCheckpoint -{ - private final int offset; - private final long inputStreamCheckpoint; - - public LongStreamV2Checkpoint(int offset, long inputStreamCheckpoint) - { - this.offset = offset; - this.inputStreamCheckpoint = checkNotNull(inputStreamCheckpoint, "inputStreamCheckpoint is null"); - } - - public LongStreamV2Checkpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - inputStreamCheckpoint = createInputStreamCheckpoint(compressionKind, positionsList); - offset = positionsList.nextPosition(); - } - - public int getOffset() - { - return offset; - } - - public long getInputStreamCheckpoint() - { - return inputStreamCheckpoint; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("offset", offset) - .add("inputStreamCheckpoint", inputStreamCheckpointToString(inputStreamCheckpoint)) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java deleted file mode 100644 index 88ac0515e5..0000000000 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/checkpoint/RowGroupDictionaryLengthStreamCheckpoint.java +++ /dev/null @@ -1,53 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.checkpoint; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.Checkpoints.ColumnPositionsList; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; - -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.inputStreamCheckpointToString; - -public final class RowGroupDictionaryLengthStreamCheckpoint - extends LongStreamV1Checkpoint -{ - private final int rowGroupDictionarySize; - - public RowGroupDictionaryLengthStreamCheckpoint(int rowGroupDictionarySize, int offset, long inputStreamCheckpoint) - { - super(offset, inputStreamCheckpoint); - this.rowGroupDictionarySize = rowGroupDictionarySize; - } - - public RowGroupDictionaryLengthStreamCheckpoint(CompressionKind compressionKind, ColumnPositionsList positionsList) - { - super(compressionKind, positionsList); - rowGroupDictionarySize = positionsList.nextPosition(); - } - - public int getRowGroupDictionarySize() - { - return rowGroupDictionarySize; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("rowGroupDictionarySize", rowGroupDictionarySize) - .add("offset", getOffset()) - 
.add("inputStreamCheckpoint", inputStreamCheckpointToString(getInputStreamCheckpoint())) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java deleted file mode 100644 index 65182d49bd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/BooleanJsonReader.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class BooleanJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private BooleanStream dataStream; - - public BooleanJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - generator.writeBoolean(dataStream.nextBit()); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - return String.valueOf(dataStream.nextBit()); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { 
- skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, BooleanStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java deleted file mode 100644 index d1008528a1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ByteJsonReader.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.ByteStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class ByteJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private ByteStream dataStream; - - public ByteJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - generator.writeNumber(dataStream.next()); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - return String.valueOf(dataStream.next()); - } - - @Override - public void skip(int skipSize) - throws IOException 
- { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java deleted file mode 100644 index 3243ead772..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DateJsonReader.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class DateJsonReader - implements JsonMapKeyReader -{ - private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1); - - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private LongStream dataStream; - - public DateJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - long millis = dataStream.next() * MILLIS_IN_DAY; - generator.writeNumber(millis); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null 
but data stream is not present"); - - long millis = dataStream.next() * MILLIS_IN_DAY; - return String.valueOf(millis); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java deleted file mode 100644 index 1adf00aeec..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/DoubleJsonReader.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.DoubleStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class DoubleJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private DoubleStream dataStream; - - public DoubleJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data 
stream is not present"); - - double value = dataStream.next(); - generator.writeNumber(value); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - double value = dataStream.next(); - return String.valueOf(value); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, DoubleStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java deleted file mode 100644 index 0b4f668dff..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/FloatJsonReader.java +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.FloatStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class FloatJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private FloatStream dataStream; - - public FloatJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream 
is not present"); - - // write value as a double to avoid strange rounding errors - double value = dataStream.next(); - generator.writeNumber(value); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // write value as a double to avoid strange rounding errors - double value = dataStream.next(); - return String.valueOf(value); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null values - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, FloatStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java deleted file mode 100644 index 6e93f8abb2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonMapKeyReader.java +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed under the Apache License, 
Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import java.io.IOException; - -public interface JsonMapKeyReader - extends JsonReader -{ - String nextValueAsMapKey() - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java deleted file mode 100644 index f35cbe6d82..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReader.java +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -public interface JsonReader -{ - void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException; - - void openRowGroup(StreamSources dataStreamSources) - throws IOException; - - void readNextValueInto(JsonGenerator generator) - throws IOException; - - void skip(int skipSize) - throws IOException; -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java deleted file mode 100644 index 06019757d2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/JsonReaders.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.joda.time.DateTimeZone; - -public final class JsonReaders -{ - private JsonReaders() - { - } - - public static JsonMapKeyReader createJsonMapKeyReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - switch (streamDescriptor.getStreamType()) { - case BOOLEAN: - return new BooleanJsonReader(streamDescriptor); - case BYTE: - return new ByteJsonReader(streamDescriptor); - case SHORT: - case INT: - case LONG: - return new LongJsonReader(streamDescriptor); - case FLOAT: - return new FloatJsonReader(streamDescriptor); - case DOUBLE: - return new DoubleJsonReader(streamDescriptor); - case BINARY: - return new SliceJsonReader(streamDescriptor, true); - case STRING: - return new SliceJsonReader(streamDescriptor, false); - case TIMESTAMP: - return new TimestampJsonReader(streamDescriptor, hiveStorageTimeZone); - case DATE: - return new DateJsonReader(streamDescriptor); - case STRUCT: - case LIST: - case MAP: - case UNION: - case DECIMAL: - case VARCHAR: - case CHAR: - default: - throw new IllegalArgumentException("Unsupported map key type: " + streamDescriptor.getStreamType()); - } - } - - public static JsonReader createJsonReader( - StreamDescriptor streamDescriptor, - boolean checkForNulls, - DateTimeZone hiveStorageTimeZone) - { - switch (streamDescriptor.getStreamType()) { - case BOOLEAN: - return new BooleanJsonReader(streamDescriptor); - case BYTE: - return new ByteJsonReader(streamDescriptor); - case SHORT: - case INT: - case LONG: - return new LongJsonReader(streamDescriptor); - case FLOAT: - return new FloatJsonReader(streamDescriptor); - case DOUBLE: - return new DoubleJsonReader(streamDescriptor); - case BINARY: - return new SliceJsonReader(streamDescriptor, true); - case STRING: - return new SliceJsonReader(streamDescriptor, false); - case TIMESTAMP: - return new TimestampJsonReader(streamDescriptor, 
hiveStorageTimeZone); - case DATE: - return new DateJsonReader(streamDescriptor); - case STRUCT: - return new StructJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); - case LIST: - return new ListJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); - case MAP: - return new MapJsonReader(streamDescriptor, checkForNulls, hiveStorageTimeZone); - case UNION: - case DECIMAL: - case VARCHAR: - case CHAR: - default: - throw new IllegalArgumentException("Unsupported type: " + streamDescriptor.getStreamType()); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java deleted file mode 100644 index d6302fb8b5..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/ListJsonReader.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.LENGTH; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class ListJsonReader - implements JsonReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean checkForNulls; - - private final JsonReader elementReader; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private LongStream lengthStream; - - public ListJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.checkForNulls = checkForNulls; - - elementReader = createJsonReader(streamDescriptor.getNestedStreams().get(0), true, hiveStorageTimeZone); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(lengthStream != null, "Value is 
not null but length stream is not present"); - - long length = lengthStream.next(); - generator.writeStartArray(); - for (int i = 0; i < length; i++) { - elementReader.readNextValueInto(generator); - } - generator.writeEndArray(); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - long elementSkipSize = lengthStream.sum(skipSize); - elementReader.skip(Ints.checkedCast(elementSkipSize)); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - lengthStream = null; - - elementReader.openStripe(dictionaryStreamSources, encoding); - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - if (checkForNulls) { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - } - lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); - - elementReader.openRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java deleted file mode 100644 index b26fc9ab5b..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDictionaryJsonReader.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in 
compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; - -public class LongDictionaryJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - @Nullable - private BooleanStream inDictionaryStream; - @Nullable - private LongStream dataStream; - - @Nonnull - private long[] dictionary = new long[0]; - - public LongDictionaryJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - 
generator.writeNull(); - return; - } - - generator.writeNumber(nextValue()); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - return String.valueOf(nextValue()); - } - - private long nextValue() - throws IOException - { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - long value = dataStream.next(); - if (inDictionaryStream == null || inDictionaryStream.nextBit()) { - value = dictionary[((int) value)]; - } - return value; - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - // skip non-null values - if (inDictionaryStream != null) { - inDictionaryStream.skip(skipSize); - } - if (skipSize > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(skipSize); - } - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - int dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); - if (dictionarySize > 0) { - if (dictionary.length < dictionarySize) { - dictionary = new long[dictionarySize]; - } - - LongStream dictionaryStream = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, LongStream.class).openStream(); - verifyFormat(dictionaryStream != null, "Dictionary is not empty but data stream is not present"); - dictionaryStream.nextLongVector(dictionarySize, dictionary); - } - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - inDictionaryStream = 
dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java deleted file mode 100644 index b6edb82db2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongDirectJsonReader.java +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class LongDirectJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - @Nullable - private BooleanStream presentStream; - @Nullable - private LongStream dataStream; - - public LongDirectJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - generator.writeNumber(dataStream.next()); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - return String.valueOf(dataStream.next()); - } - - @Override - public void skip(int skipSize) - throws 
IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - // skip non-null values - if (skipSize > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(skipSize); - } - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java deleted file mode 100644 index 4793a11280..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/LongJsonReader.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; - -public class LongJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - private final LongDirectJsonReader directReader; - - private final LongDictionaryJsonReader dictionaryReader; - private JsonMapKeyReader currentReader; - - public LongJsonReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - directReader = new LongDirectJsonReader(streamDescriptor); - dictionaryReader = new LongDictionaryJsonReader(streamDescriptor); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - currentReader.readNextValueInto(generator); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - return currentReader.nextValueAsMapKey(); - } - - @Override - public void skip(int skipSize) - throws IOException - { - currentReader.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); - if (kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT) { - currentReader = directReader; - } - else if (kind == DICTIONARY || kind == DICTIONARY_V2) { - 
currentReader = dictionaryReader; - } - else { - throw new IllegalArgumentException("Unsupported encoding " + kind); - } - - currentReader.openStripe(dictionaryStreamSources, encoding); - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - currentReader.openRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java deleted file mode 100644 index 5b6b73b055..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/MapJsonReader.java +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonMapKeyReader; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.LENGTH; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class MapJsonReader - implements JsonReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean checkForNulls; - - private final JsonMapKeyReader keyReader; - private final JsonReader valueReader; - - @Nullable - private BooleanStream presentStream; - @Nullable - private LongStream lengthStream; - - public MapJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.checkForNulls = checkForNulls; - - keyReader = createJsonMapKeyReader(streamDescriptor.getNestedStreams().get(0), hiveStorageTimeZone); - valueReader = createJsonReader(streamDescriptor.getNestedStreams().get(1), true, hiveStorageTimeZone); - } - - 
@Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - long length = lengthStream.next(); - generator.writeStartObject(); - for (int i = 0; i < length; i++) { - String name = keyReader.nextValueAsMapKey(); - if (name == null) { - valueReader.skip(1); - } - else { - generator.writeFieldName(name); - valueReader.readNextValueInto(generator); - } - } - generator.writeEndObject(); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - // skip non-null values - long elementSkipSize = lengthStream.sum(skipSize); - keyReader.skip(Ints.checkedCast(elementSkipSize)); - valueReader.skip(Ints.checkedCast(elementSkipSize)); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - lengthStream = null; - - keyReader.openStripe(dictionaryStreamSources, encoding); - valueReader.openStripe(dictionaryStreamSources, encoding); - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - if (checkForNulls) { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - } - lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); - - keyReader.openRowGroup(dataStreamSources); - valueReader.openRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java deleted file mode 100644 index bf7cb6fc13..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDictionaryJsonReader.java +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.google.common.io.BaseEncoding; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.*; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static java.nio.charset.StandardCharsets.UTF_8; - -public class SliceDictionaryJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean writeBinary; - - @Nonnull - private DictionaryEntry[] dictionary = new DictionaryEntry[0]; - - @Nonnull - private int[] dictionaryLength = new int[0]; - - @Nonnull - private DictionaryEntry[] rowGroupDictionary = new DictionaryEntry[0]; - - @Nonnull - private int[] rowGroupDictionaryLength = new int[0]; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private BooleanStream inDictionaryStream; - - @Nullable - private LongStream dataStream; - - public SliceDictionaryJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.writeBinary = writeBinary; - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - DictionaryEntry value = getNextValue(); - - byte[] data = value.getData(); - int offset = 
value.getOffset(); - int length = value.length(); - if (writeBinary) { - generator.writeBinary(data, offset, length); - } - else { - generator.writeUTF8String(data, offset, length); - } - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - DictionaryEntry value = getNextValue(); - - byte[] data = value.getData(); - int offset = value.getOffset(); - int length = value.length(); - if (writeBinary) { - return BaseEncoding.base64().encode(data, offset, length); - } - else { - return new String(data, offset, length, UTF_8); - } - } - - private DictionaryEntry getNextValue() - throws IOException - { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - int dictionaryIndex = Ints.checkedCast(dataStream.next()); - - DictionaryEntry value; - if (inDictionaryStream == null || inDictionaryStream.nextBit()) { - value = dictionary[dictionaryIndex]; - } - else { - value = rowGroupDictionary[dictionaryIndex]; - } - return value; - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - - // skip non-null length - if (inDictionaryStream != null) { - inDictionaryStream.skip(skipSize); - } - dataStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - int dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); - if (dictionarySize > 0) { - // resize the dictionary array if necessary - if (dictionary.length < dictionarySize) { - dictionary = new DictionaryEntry[dictionarySize]; - dictionaryLength = new int[dictionarySize]; - } - - LongStream lengthStream = 
dictionaryStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); - verifyFormat(lengthStream != null, "Dictionary is not empty but length stream is not present"); - lengthStream.nextIntVector(dictionarySize, dictionaryLength); - - ByteArrayStream dictionaryDataStream = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, ByteArrayStream.class).openStream(); - readDictionary(dictionaryDataStream, dictionarySize, dictionaryLength, dictionary); - } - - presentStream = null; - dataStream = null; - inDictionaryStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - RowGroupDictionaryLengthStream lengthStream = dataStreamSources.getStreamSource( - streamDescriptor, - ROW_GROUP_DICTIONARY_LENGTH, - RowGroupDictionaryLengthStream.class).openStream(); - - if (lengthStream == null) { - inDictionaryStream = null; - } - else { - inDictionaryStream = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class).openStream(); - - int dictionaryEntryCount = lengthStream.getEntryCount(); - - // resize the dictionary array if necessary - if (rowGroupDictionary.length < dictionaryEntryCount) { - rowGroupDictionary = new DictionaryEntry[dictionaryEntryCount]; - rowGroupDictionaryLength = new int[dictionaryEntryCount]; - } - - // read the lengths - lengthStream.nextIntVector(dictionaryEntryCount, rowGroupDictionaryLength); - - ByteArrayStream dictionaryDataStream = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY, ByteArrayStream.class).openStream(); - readDictionary(dictionaryDataStream, dictionaryEntryCount, rowGroupDictionaryLength, rowGroupDictionary); - } - - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - } - - private static void 
readDictionary(ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, DictionaryEntry[] dictionary) - throws IOException - { - // sum lengths - int totalLength = 0; - for (int i = 0; i < dictionarySize; i++) { - totalLength += dictionaryLength[i]; - } - - // read dictionary data - byte[] dictionaryData = new byte[0]; - if (totalLength > 0) { - verifyFormat(dictionaryDataStream != null, "Dictionary length is not zero but dictionary data stream is not present"); - dictionaryData = dictionaryDataStream.next(totalLength); - } - - // build dictionary slices - int offset = 0; - for (int i = 0; i < dictionarySize; i++) { - int length = dictionaryLength[i]; - dictionary[i] = new DictionaryEntry(dictionaryData, offset, length); - offset += length; - } - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } - - private static class DictionaryEntry - { - private final byte[] dictionary; - private final int offset; - private final int length; - - public DictionaryEntry(byte[] dictionary, int offset, int length) - { - this.dictionary = dictionary; - this.offset = offset; - this.length = length; - } - - public int length() - { - return length; - } - - public byte[] getData() - { - return dictionary; - } - - public int getOffset() - { - return offset; - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java deleted file mode 100644 index 6f6630c59e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceDirectJsonReader.java +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import com.google.common.io.BaseEncoding; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.ByteArrayStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static java.nio.charset.StandardCharsets.UTF_8; - -public class SliceDirectJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean writeBinary; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private LongStream lengthStream; - - @Nullable - private ByteArrayStream dataStream; - - @Nonnull - private byte[] data = new byte[1024]; - - public SliceDirectJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) - { - 
this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.writeBinary = writeBinary; - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - int length = bufferNextValue(); - - if (writeBinary) { - generator.writeBinary(data, 0, length); - } - else { - generator.writeUTF8String(data, 0, length); - } - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - int length = bufferNextValue(); - - if (writeBinary) { - return BaseEncoding.base64().encode(data, 0, length); - } - else { - return new String(data, 0, length, UTF_8); - } - } - - private int bufferNextValue() - throws IOException - { - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - int length = Ints.checkedCast(lengthStream.next()); - if (data.length < length) { - data = new byte[length]; - } - - if (length > 0) { - verifyFormat(dataStream != null, "Length is not zero but data stream is not present"); - dataStream.next(length, data); - } - return length; - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - - // skip non-null length - long dataSkipSize = lengthStream.sum(skipSize); - - if (dataSkipSize == 0) { - return; - } - - verifyFormat(dataStream != null, "Length is not zero but data stream is not present"); - - // skip data bytes - dataStream.skip(Ints.checkedCast(dataSkipSize)); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - lengthStream 
= null; - dataStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - lengthStream = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class).openStream(); - dataStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteArrayStream.class).openStream(); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java deleted file mode 100644 index 68892ca244..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/SliceJsonReader.java +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; - -public class SliceJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - private final SliceDirectJsonReader directReader; - private final SliceDictionaryJsonReader dictionaryReader; - private JsonMapKeyReader currentReader; - - public SliceJsonReader(StreamDescriptor streamDescriptor, boolean writeBinary) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - directReader = new SliceDirectJsonReader(streamDescriptor, writeBinary); - dictionaryReader = new SliceDictionaryJsonReader(streamDescriptor, writeBinary); - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - currentReader.readNextValueInto(generator); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - return currentReader.nextValueAsMapKey(); - } - - @Override - public void skip(int skipSize) - throws IOException - { - currentReader.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, - List encoding) - throws IOException - { - ColumnEncodingKind columnEncodingKind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); - if (columnEncodingKind == DIRECT || columnEncodingKind == DIRECT_V2 || columnEncodingKind == 
ColumnEncodingKind.DWRF_DIRECT) { - currentReader = directReader; - } - else if (columnEncodingKind == DICTIONARY || columnEncodingKind == DICTIONARY_V2) { - currentReader = dictionaryReader; - } - else { - throw new IllegalArgumentException("Unsupported encoding " + columnEncodingKind); - } - - currentReader.openStripe(dictionaryStreamSources, encoding); - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - currentReader.openRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java deleted file mode 100644 index 600b7b778d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/StructJsonReader.java +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; - -public class StructJsonReader - implements JsonReader -{ - private final StreamDescriptor streamDescriptor; - private final boolean checkForNulls; - private final JsonReader[] structFields; - - @Nullable - private BooleanStream presentStream; - - public StructJsonReader(StreamDescriptor streamDescriptor, boolean checkForNulls, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.checkForNulls = checkForNulls; - - List nestedStreams = streamDescriptor.getNestedStreams(); - this.structFields = new JsonReader[nestedStreams.size()]; - for (int i = 0; i < nestedStreams.size(); i++) { - StreamDescriptor nestedStream = nestedStreams.get(i); - this.structFields[i] = createJsonReader(nestedStream, true, hiveStorageTimeZone); - } - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - generator.writeStartArray(); - for (JsonReader structField : structFields) { - structField.readNextValueInto(generator); - } - generator.writeEndArray(); - } - - @Override 
- public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - // skip non-null values - for (JsonReader structField : structFields) { - structField.skip(skipSize); - } - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - - for (JsonReader structField : structFields) { - structField.openStripe(dictionaryStreamSources, encoding); - } - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - if (checkForNulls) { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - } - - for (JsonReader structField : structFields) { - structField.openRowGroup(dataStreamSources); - } - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java deleted file mode 100644 index bfebf78658..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/json/TimestampJsonReader.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.json; - -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.reader.TimestampStreamReader.decodeTimestamp; - -public class TimestampJsonReader - implements JsonMapKeyReader -{ - private final StreamDescriptor streamDescriptor; - - private final long baseTimestampInSeconds; - - @Nullable - private BooleanStream presentStream; - - @Nullable - private LongStream secondsStream; - - @Nullable - private LongStream nanosStream; - - public TimestampJsonReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / 1000; - } - - @Override - public void readNextValueInto(JsonGenerator generator) - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - generator.writeNull(); - return; - } - - 
verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - long timestamp = decodeTimestamp(secondsStream.next(), nanosStream.next(), baseTimestampInSeconds); - generator.writeNumber(timestamp); - } - - @Override - public String nextValueAsMapKey() - throws IOException - { - if (presentStream != null && !presentStream.nextBit()) { - return null; - } - - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - long timestamp = decodeTimestamp(secondsStream.next(), nanosStream.next(), baseTimestampInSeconds); - return String.valueOf(timestamp); - } - - @Override - public void skip(int skipSize) - throws IOException - { - // skip nulls - if (presentStream != null) { - skipSize = presentStream.countBitsSet(skipSize); - } - - if (skipSize == 0) { - return; - } - - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - // skip non-null values - secondsStream.skip(skipSize); - nanosStream.skip(skipSize); - } - - @Override - public void openStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStream = null; - secondsStream = null; - nanosStream = null; - } - - @Override - public void openRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStream = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class).openStream(); - secondsStream = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class).openStream(); - nanosStream = dataStreamSources.getStreamSource(streamDescriptor, SECONDARY, LongStream.class).openStream(); - } - - @Override - public String toString() - { - return 
toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java deleted file mode 100644 index 20ae97058e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/DwrfMetadataReader.java +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.facebook.hive.orc.OrcProto; -import com.facebook.hive.orc.OrcProto.ColumnEncoding.Kind; -import com.google.common.base.Function; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterables; -import com.google.common.primitives.Ints; -import com.google.protobuf.CodedInputStream; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkState; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; - -public class DwrfMetadataReader - implements MetadataReader -{ - @Override - public PostScript readPostScript(byte[] data, int offset, int length) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(data, offset, length); - OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(input); - - return new PostScript( - ImmutableList.of(), - postScript.getFooterLength(), - 0, - toCompression(postScript.getCompression()), - postScript.getCompressionBlockSize()); - } - - @Override - public Metadata readMetadata(InputStream inputStream) - throws IOException - { - return new Metadata(ImmutableList.of()); - } - - @Override - public Footer readFooter(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.Footer footer = OrcProto.Footer.parseFrom(input); - return new Footer( - footer.getNumberOfRows(), - footer.getRowIndexStride(), - toStripeInformation(footer.getStripesList()), - toType(footer.getTypesList()), - 
toColumnStatistics(footer.getStatisticsList(), false)); - } - - private static List toStripeInformation(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public StripeInformation apply(OrcProto.StripeInformation type) - { - return toStripeInformation(type); - } - })); - } - - private static StripeInformation toStripeInformation(OrcProto.StripeInformation stripeInformation) - { - return new StripeInformation( - Ints.checkedCast(stripeInformation.getNumberOfRows()), - stripeInformation.getOffset(), - stripeInformation.getIndexLength(), - stripeInformation.getDataLength(), - stripeInformation.getFooterLength()); - } - - @Override - public StripeFooter readStripeFooter(List types, InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.StripeFooter stripeFooter = OrcProto.StripeFooter.parseFrom(input); - return new StripeFooter(toStream(stripeFooter.getStreamsList()), toColumnEncoding(types, stripeFooter.getColumnsList())); - } - - private static Stream toStream(OrcProto.Stream stream) - { - return new Stream(stream.getColumn(), toStreamKind(stream.getKind()), Ints.checkedCast(stream.getLength()), stream.getUseVInts()); - } - - private static List toStream(List streams) - { - return ImmutableList.copyOf(Iterables.transform(streams, new Function() - { - @Override - public Stream apply(OrcProto.Stream stream) - { - return toStream(stream); - } - })); - } - - private static ColumnEncoding toColumnEncoding(OrcTypeKind type, OrcProto.ColumnEncoding columnEncoding) - { - return new ColumnEncoding(toColumnEncodingKind(type, columnEncoding.getKind()), columnEncoding.getDictionarySize()); - } - - private static List toColumnEncoding(List types, List columnEncodings) - { - checkArgument(types.size() == columnEncodings.size()); - - ImmutableList.Builder encodings = ImmutableList.builder(); - for (int i = 0; i < types.size(); i++) { - OrcType type = 
types.get(i); - encodings.add(toColumnEncoding(type.getOrcTypeKind(), columnEncodings.get(i))); - } - return encodings.build(); - } - - @Override - public List readRowIndexes(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); - return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() - { - @Override - public RowGroupIndex apply(OrcProto.RowIndexEntry rowIndexEntry) - { - return toRowGroupIndex(rowIndexEntry); - } - })); - } - - private static RowGroupIndex toRowGroupIndex(OrcProto.RowIndexEntry rowIndexEntry) - { - List positionsList = rowIndexEntry.getPositionsList(); - ImmutableList.Builder positions = ImmutableList.builder(); - for (int index = 0; index < positionsList.size(); index++) { - long longPosition = positionsList.get(index); - int intPosition = (int) longPosition; - - checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index); - - positions.add(intPosition); - } - return new RowGroupIndex(positions.build(), toColumnStatistics(rowIndexEntry.getStatistics(), true)); - } - - private static List toColumnStatistics(List columnStatistics, final boolean isRowGroup) - { - if (columnStatistics == null) { - return ImmutableList.of(); - } - return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() - { - @Override - public ColumnStatistics apply(OrcProto.ColumnStatistics columnStatistics) - { - return toColumnStatistics(columnStatistics, isRowGroup); - } - })); - } - - private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) - { - return new ColumnStatistics( - statistics.getNumberOfValues(), - toBooleanStatistics(statistics.getBucketStatistics()), - toIntegerStatistics(statistics.getIntStatistics()), - toDoubleStatistics(statistics.getDoubleStatistics()), - 
toStringStatistics(statistics.getStringStatistics(), isRowGroup), - null); - } - - private static BooleanStatistics toBooleanStatistics(OrcProto.BucketStatistics bucketStatistics) - { - if (bucketStatistics.getCountCount() == 0) { - return null; - } - - return new BooleanStatistics(bucketStatistics.getCount(0)); - } - - private static IntegerStatistics toIntegerStatistics(OrcProto.IntegerStatistics integerStatistics) - { - if (!integerStatistics.hasMinimum() && !integerStatistics.hasMaximum()) { - return null; - } - - return new IntegerStatistics( - integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null, - integerStatistics.hasMaximum() ? integerStatistics.getMaximum() : null); - } - - private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics doubleStatistics) - { - if (!doubleStatistics.hasMinimum() && !doubleStatistics.hasMaximum()) { - return null; - } - - // TODO remove this when double statistics are changed to correctly deal with NaNs - // if either min or max is NaN, ignore the stat - if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || - (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum()))) { - return null; - } - - return new DoubleStatistics( - doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null, - doubleStatistics.hasMaximum() ? 
doubleStatistics.getMaximum() : null); - } - - private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) - { - // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 - if (!isRowGroup) { - return null; - } - - if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { - return null; - } - - // temporarily disable string statistics until we figure out the implications of how UTF-16 - // strings are compared when they contain surrogate pairs and replacement characters - if (true) { - return null; - } - - return new StringStatistics( - stringStatistics.hasMinimum() ? stringStatistics.getMinimum() : null, - stringStatistics.hasMaximum() ? stringStatistics.getMaximum() : null); - } - - private static OrcType toType(OrcProto.Type type) - { - return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList()); - } - - private static List toType(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public OrcType apply(OrcProto.Type type) - { - return toType(type); - } - })); - } - - private static OrcTypeKind toTypeKind(OrcProto.Type.Kind kind) - { - switch (kind) { - case BOOLEAN: - return OrcTypeKind.BOOLEAN; - case BYTE: - return OrcTypeKind.BYTE; - case SHORT: - return OrcTypeKind.SHORT; - case INT: - return OrcTypeKind.INT; - case LONG: - return OrcTypeKind.LONG; - case FLOAT: - return OrcTypeKind.FLOAT; - case DOUBLE: - return OrcTypeKind.DOUBLE; - case STRING: - return OrcTypeKind.STRING; - case BINARY: - return OrcTypeKind.BINARY; - case TIMESTAMP: - return OrcTypeKind.TIMESTAMP; - case LIST: - return OrcTypeKind.LIST; - case MAP: - return OrcTypeKind.MAP; - case STRUCT: - return OrcTypeKind.STRUCT; - case UNION: - return OrcTypeKind.UNION; - default: - throw new IllegalArgumentException(kind + " data type not implemented yet"); - } - } - - private static 
StreamKind toStreamKind(OrcProto.Stream.Kind kind) - { - switch (kind) { - case PRESENT: - return StreamKind.PRESENT; - case DATA: - return StreamKind.DATA; - case LENGTH: - return StreamKind.LENGTH; - case DICTIONARY_DATA: - return StreamKind.DICTIONARY_DATA; - case DICTIONARY_COUNT: - return StreamKind.DICTIONARY_COUNT; - case NANO_DATA: - return StreamKind.SECONDARY; - case ROW_INDEX: - return StreamKind.ROW_INDEX; - case IN_DICTIONARY: - return StreamKind.IN_DICTIONARY; - case STRIDE_DICTIONARY: - return StreamKind.ROW_GROUP_DICTIONARY; - case STRIDE_DICTIONARY_LENGTH: - return StreamKind.ROW_GROUP_DICTIONARY_LENGTH; - default: - throw new IllegalArgumentException(kind + " stream type not implemented yet"); - } - } - - private static ColumnEncodingKind toColumnEncodingKind(OrcTypeKind type, Kind kind) - { - switch (kind) { - case DIRECT: - if (type == OrcTypeKind.SHORT || type == OrcTypeKind.INT || type == OrcTypeKind.LONG) { - return ColumnEncodingKind.DWRF_DIRECT; - } - else { - return ColumnEncodingKind.DIRECT; - } - case DICTIONARY: - return ColumnEncodingKind.DICTIONARY; - default: - throw new IllegalArgumentException(kind + " stream encoding not implemented yet"); - } - } - - private static CompressionKind toCompression(OrcProto.CompressionKind compression) - { - switch (compression) { - case NONE: - return UNCOMPRESSED; - case ZLIB: - return ZLIB; - case SNAPPY: - return SNAPPY; - default: - throw new IllegalArgumentException(compression + " compression not implemented yet"); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java deleted file mode 100644 index 38bae8b8f2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/OrcMetadataReader.java +++ /dev/null @@ -1,402 +0,0 @@ -/* - * Licensed 
under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -import com.facebook.presto.hive.shaded.com.google.protobuf.CodedInputStream; -import com.google.common.base.Function; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Iterables; -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; -import org.apache.hadoop.hive.ql.io.orc.OrcProto; -import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; - -import static com.google.common.base.Preconditions.checkState; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; - -public class OrcMetadataReader - implements MetadataReader -{ - @Override - public PostScript readPostScript(byte[] data, int offset, int length) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(data, offset, length); - OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(input); - - return new PostScript( - postScript.getVersionList(), - postScript.getFooterLength(), - postScript.getMetadataLength(), - toCompression(postScript.getCompression()), - postScript.getCompressionBlockSize()); - 
} - - @Override - public Metadata readMetadata(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.Metadata metadata = OrcProto.Metadata.parseFrom(input); - return new Metadata(toStripeStatistics(metadata.getStripeStatsList())); - } - - private static List toStripeStatistics(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public StripeStatistics apply(OrcProto.StripeStatistics type) - { - return toStripeStatistics(type); - } - })); - } - - private static StripeStatistics toStripeStatistics(OrcProto.StripeStatistics stripeStatistics) - { - return new StripeStatistics(toColumnStatistics(stripeStatistics.getColStatsList(), false)); - } - - @Override - public Footer readFooter(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.Footer footer = OrcProto.Footer.parseFrom(input); - return new Footer( - footer.getNumberOfRows(), - footer.getRowIndexStride(), - toStripeInformation(footer.getStripesList()), - toType(footer.getTypesList()), - toColumnStatistics(footer.getStatisticsList(), false)); - } - - private static List toStripeInformation(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public StripeInformation apply(OrcProto.StripeInformation type) - { - return toStripeInformation(type); - } - })); - } - - private static StripeInformation toStripeInformation(OrcProto.StripeInformation stripeInformation) - { - return new StripeInformation( - Ints.checkedCast(stripeInformation.getNumberOfRows()), - stripeInformation.getOffset(), - stripeInformation.getIndexLength(), - stripeInformation.getDataLength(), - stripeInformation.getFooterLength()); - } - - @Override - public StripeFooter readStripeFooter(List types, InputStream inputStream) - throws IOException - { - CodedInputStream input = 
CodedInputStream.newInstance(inputStream); - OrcProto.StripeFooter stripeFooter = OrcProto.StripeFooter.parseFrom(input); - return new StripeFooter(toStream(stripeFooter.getStreamsList()), toColumnEncoding(stripeFooter.getColumnsList())); - } - - private static Stream toStream(OrcProto.Stream stream) - { - return new Stream(stream.getColumn(), toStreamKind(stream.getKind()), Ints.checkedCast(stream.getLength()), true); - } - - private static List toStream(List streams) - { - return ImmutableList.copyOf(Iterables.transform(streams, new Function() - { - @Override - public Stream apply(OrcProto.Stream stream) - { - return toStream(stream); - } - })); - } - - private static ColumnEncoding toColumnEncoding(OrcProto.ColumnEncoding columnEncoding) - { - return new ColumnEncoding(toColumnEncodingKind(columnEncoding.getKind()), columnEncoding.getDictionarySize()); - } - - private static List toColumnEncoding(List columnEncodings) - { - return ImmutableList.copyOf(Iterables.transform(columnEncodings, new Function() - { - @Override - public ColumnEncoding apply(OrcProto.ColumnEncoding columnEncoding) - { - return toColumnEncoding(columnEncoding); - } - })); - } - - @Override - public List readRowIndexes(InputStream inputStream) - throws IOException - { - CodedInputStream input = CodedInputStream.newInstance(inputStream); - OrcProto.RowIndex rowIndex = OrcProto.RowIndex.parseFrom(input); - return ImmutableList.copyOf(Iterables.transform(rowIndex.getEntryList(), new Function() - { - @Override - public RowGroupIndex apply(RowIndexEntry rowIndexEntry) - { - return toRowGroupIndex(rowIndexEntry); - } - })); - } - - private static RowGroupIndex toRowGroupIndex(RowIndexEntry rowIndexEntry) - { - List positionsList = rowIndexEntry.getPositionsList(); - ImmutableList.Builder positions = ImmutableList.builder(); - for (int index = 0; index < positionsList.size(); index++) { - long longPosition = positionsList.get(index); - int intPosition = (int) longPosition; - - 
checkState(intPosition == longPosition, "Expected checkpoint position %s, to be an integer", index); - - positions.add(intPosition); - } - return new RowGroupIndex(positions.build(), toColumnStatistics(rowIndexEntry.getStatistics(), true)); - } - - private static ColumnStatistics toColumnStatistics(OrcProto.ColumnStatistics statistics, boolean isRowGroup) - { - return new ColumnStatistics( - statistics.getNumberOfValues(), - toBooleanStatistics(statistics.getBucketStatistics()), - toIntegerStatistics(statistics.getIntStatistics()), - toDoubleStatistics(statistics.getDoubleStatistics()), - toStringStatistics(statistics.getStringStatistics(), isRowGroup), - toDateStatistics(statistics.getDateStatistics(), isRowGroup)); - } - - private static List toColumnStatistics(List columnStatistics, final boolean isRowGroup) - { - if (columnStatistics == null) { - return ImmutableList.of(); - } - return ImmutableList.copyOf(Iterables.transform(columnStatistics, new Function() - { - @Override - public ColumnStatistics apply(OrcProto.ColumnStatistics columnStatistics) - { - return toColumnStatistics(columnStatistics, isRowGroup); - } - })); - } - - private static BooleanStatistics toBooleanStatistics(OrcProto.BucketStatistics bucketStatistics) - { - if (bucketStatistics.getCountCount() == 0) { - return null; - } - - return new BooleanStatistics(bucketStatistics.getCount(0)); - } - - private static IntegerStatistics toIntegerStatistics(OrcProto.IntegerStatistics integerStatistics) - { - if (!integerStatistics.hasMinimum() && !integerStatistics.hasMaximum()) { - return null; - } - - return new IntegerStatistics( - integerStatistics.hasMinimum() ? integerStatistics.getMinimum() : null, - integerStatistics.hasMaximum() ? 
integerStatistics.getMaximum() : null); - } - - private static DoubleStatistics toDoubleStatistics(OrcProto.DoubleStatistics doubleStatistics) - { - if (!doubleStatistics.hasMinimum() && !doubleStatistics.hasMaximum()) { - return null; - } - - // TODO remove this when double statistics are changed to correctly deal with NaNs - // if either min or max is NaN, ignore the stat - if ((doubleStatistics.hasMinimum() && Double.isNaN(doubleStatistics.getMinimum())) || - (doubleStatistics.hasMaximum() && Double.isNaN(doubleStatistics.getMaximum()))) { - return null; - } - - return new DoubleStatistics( - doubleStatistics.hasMinimum() ? doubleStatistics.getMinimum() : null, - doubleStatistics.hasMaximum() ? doubleStatistics.getMaximum() : null); - } - - private static StringStatistics toStringStatistics(OrcProto.StringStatistics stringStatistics, boolean isRowGroup) - { - // TODO remove this when string statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 - if (!isRowGroup) { - return null; - } - - if (!stringStatistics.hasMinimum() && !stringStatistics.hasMaximum()) { - return null; - } - - return new StringStatistics( - stringStatistics.hasMinimum() ? stringStatistics.getMinimum() : null, - stringStatistics.hasMaximum() ? stringStatistics.getMaximum() : null); - } - - private static DateStatistics toDateStatistics(OrcProto.DateStatistics dateStatistics, boolean isRowGroup) - { - // TODO remove this when date statistics in ORC are fixed https://issues.apache.org/jira/browse/HIVE-8732 - if (!isRowGroup) { - return null; - } - - if (!dateStatistics.hasMinimum() && !dateStatistics.hasMaximum()) { - return null; - } - - // temporarily disable string statistics until we figure out the implications of how UTF-16 - // strings are compared when they contain surrogate pairs and replacement characters - if (true) { - return null; - } - - return new DateStatistics( - dateStatistics.hasMinimum() ? 
dateStatistics.getMinimum() : null, - dateStatistics.hasMaximum() ? dateStatistics.getMaximum() : null); - } - - private static OrcType toType(OrcProto.Type type) - { - return new OrcType(toTypeKind(type.getKind()), type.getSubtypesList(), type.getFieldNamesList()); - } - - private static List toType(List types) - { - return ImmutableList.copyOf(Iterables.transform(types, new Function() - { - @Override - public OrcType apply(OrcProto.Type type) - { - return toType(type); - } - })); - } - - private static OrcTypeKind toTypeKind(OrcProto.Type.Kind typeKind) - { - switch (typeKind) { - case BOOLEAN: - return OrcTypeKind.BOOLEAN; - case BYTE: - return OrcTypeKind.BYTE; - case SHORT: - return OrcTypeKind.SHORT; - case INT: - return OrcTypeKind.INT; - case LONG: - return OrcTypeKind.LONG; - case FLOAT: - return OrcTypeKind.FLOAT; - case DOUBLE: - return OrcTypeKind.DOUBLE; - case STRING: - return OrcTypeKind.STRING; - case BINARY: - return OrcTypeKind.BINARY; - case TIMESTAMP: - return OrcTypeKind.TIMESTAMP; - case LIST: - return OrcTypeKind.LIST; - case MAP: - return OrcTypeKind.MAP; - case STRUCT: - return OrcTypeKind.STRUCT; - case UNION: - return OrcTypeKind.UNION; - case DECIMAL: - return OrcTypeKind.DECIMAL; - case DATE: - return OrcTypeKind.DATE; - case VARCHAR: - return OrcTypeKind.VARCHAR; - case CHAR: - return OrcTypeKind.CHAR; - default: - throw new IllegalStateException(typeKind + " stream type not implemented yet"); - } - } - - private static StreamKind toStreamKind(OrcProto.Stream.Kind streamKind) - { - switch (streamKind) { - case PRESENT: - return StreamKind.PRESENT; - case DATA: - return StreamKind.DATA; - case LENGTH: - return StreamKind.LENGTH; - case DICTIONARY_DATA: - return StreamKind.DICTIONARY_DATA; - case DICTIONARY_COUNT: - return StreamKind.DICTIONARY_COUNT; - case SECONDARY: - return StreamKind.SECONDARY; - case ROW_INDEX: - return StreamKind.ROW_INDEX; - default: - throw new IllegalStateException(streamKind + " stream type not implemented 
yet"); - } - } - - private static ColumnEncodingKind toColumnEncodingKind(OrcProto.ColumnEncoding.Kind columnEncodingKind) - { - switch (columnEncodingKind) { - case DIRECT: - return ColumnEncodingKind.DIRECT; - case DIRECT_V2: - return ColumnEncodingKind.DIRECT_V2; - case DICTIONARY: - return ColumnEncodingKind.DICTIONARY; - case DICTIONARY_V2: - return ColumnEncodingKind.DICTIONARY_V2; - default: - throw new IllegalStateException(columnEncodingKind + " stream encoding not implemented yet"); - } - } - - private static CompressionKind toCompression(OrcProto.CompressionKind compression) - { - switch (compression) { - case NONE: - return UNCOMPRESSED; - case ZLIB: - return ZLIB; - case SNAPPY: - return SNAPPY; - default: - throw new IllegalStateException(compression + " compression not implemented yet"); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java deleted file mode 100644 index 17cb8ba289..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/metadata/StringStatistics.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.metadata; - -public class StringStatistics - implements RangeStatistics -{ - private final String minimum; - private final String maximum; - - public StringStatistics(String minimum, String maximum) - { - this.minimum = minimum; - this.maximum = maximum; - } - - @Override - public String getMin() - { - return minimum; - } - - @Override - public String getMax() - { - return maximum; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java deleted file mode 100644 index cb38b2ed6e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/BooleanStreamReader.java +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.BooleanVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class BooleanStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream dataStream; - - private boolean rowGroupOpen; - - public BooleanStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if 
(!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - BooleanVector booleanVector = (BooleanVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(booleanVector.isNull, false); - dataStream.getSetBits(nextBatchSize, booleanVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, booleanVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.getSetBits(nextBatchSize, booleanVector.vector, booleanVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(BooleanStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, BooleanStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - 
dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java deleted file mode 100644 index 3688d2fce2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/ByteStreamReader.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.ByteStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class ByteStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(ByteStream.class); - @Nullable - private ByteStream dataStream; - - private boolean rowGroupOpen; - - public ByteStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object 
vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - LongVector byteVector = (LongVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(byteVector.isNull, false); - dataStream.nextVector(nextBatchSize, byteVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, byteVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextVector(nextBatchSize, byteVector.vector, byteVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(ByteStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; 
- dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java deleted file mode 100644 index afca11996d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/DoubleStreamReader.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.DoubleVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.DoubleStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class DoubleStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(DoubleStream.class); - @Nullable - private DoubleStream dataStream; - - private boolean rowGroupOpen; - - public DoubleStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void 
readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - DoubleVector doubleVector = (DoubleVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(doubleVector.isNull, false); - dataStream.nextVector(nextBatchSize, doubleVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, doubleVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextVector(nextBatchSize, doubleVector.vector, doubleVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(DoubleStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, DoubleStream.class); - - readOffset = 0; - 
nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java deleted file mode 100644 index 8d75390337..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/FloatStreamReader.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.DoubleVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.FloatStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class FloatStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(FloatStream.class); - @Nullable - private FloatStream dataStream; - - private boolean rowGroupOpen; - - public FloatStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void 
readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - // we could add a float vector but Presto currently doesn't support floats - DoubleVector floatVector = (DoubleVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(floatVector.isNull, false); - dataStream.nextVector(nextBatchSize, floatVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, floatVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextVector(nextBatchSize, floatVector.vector, floatVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(FloatStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = 
dataStreamSources.getStreamSource(streamDescriptor, DATA, FloatStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java deleted file mode 100644 index 8048e61335..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/JsonStreamReader.java +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import org.apache.tajo.storage.thirdparty.orc.SliceVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.json.JsonReader; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import io.airlift.slice.DynamicSliceOutput; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.json.JsonReaders.createJsonReader; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class JsonStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - private final JsonReader jsonReader; - - private boolean stripeOpen; - private boolean rowGroupOpen; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; - - private int readOffset; - private int nextBatchSize; - - @Nullable - private StreamSources dictionaryStreamSources; - @Nullable - private StreamSources dataStreamSources; - - private List encoding; - - public 
JsonStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.jsonReader = createJsonReader(streamDescriptor, false, hiveStorageTimeZone); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the length reader - readOffset = presentStream.countBitsSet(readOffset); - } - - jsonReader.skip(readOffset); - } - - SliceVector sliceVector = (SliceVector) vector; - if (presentStream != null) { - presentStream.getUnsetBits(nextBatchSize, isNullVector); - } - - DynamicSliceOutput out = new DynamicSliceOutput(1024); - for (int i = 0; i < nextBatchSize; i++) { - if (!isNullVector[i]) { - out.reset(); - JsonGenerator generator = new JsonFactory().createGenerator(out); - jsonReader.readNextValueInto(generator); - sliceVector.vector[i] = out.copySlice(); - } - else { - sliceVector.vector[i] = null; - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - - if (!stripeOpen) { - jsonReader.openStripe(dictionaryStreamSources, encoding); - } - - jsonReader.openRowGroup(dataStreamSources); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - this.dictionaryStreamSources = dictionaryStreamSources; - this.dataStreamSources = null; - this.encoding = encoding; - - presentStreamSource = missingStreamSource(BooleanStream.class); - - stripeOpen = false; - rowGroupOpen = false; - - readOffset = 0; - nextBatchSize = 0; - - 
Arrays.fill(isNullVector, false); - - presentStream = null; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - this.dataStreamSources = dataStreamSources; - - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - - rowGroupOpen = false; - - readOffset = 0; - nextBatchSize = 0; - - Arrays.fill(isNullVector, false); - - presentStream = null; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java deleted file mode 100644 index bd847f6efd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDictionaryStreamReader.java +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class LongDictionaryStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dictionaryDataStreamSource = missingStreamSource(LongStream.class); - private int dictionarySize; - @Nonnull - private long[] dictionary = new long[0]; - - @Nonnull - private StreamSource inDictionaryStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream inDictionaryStream; - private final boolean[] inDictionary = new boolean[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource dataStreamSource; - @Nullable - private LongStream 
dataStream; - - private boolean dictionaryOpen; - private boolean rowGroupOpen; - - public LongDictionaryStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the length reader - readOffset = presentStream.countBitsSet(readOffset); - } - - if (inDictionaryStream != null) { - inDictionaryStream.skip(readOffset); - } - - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - LongVector longVector = (LongVector) vector; - - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(longVector.isNull, false); - dataStream.nextLongVector(nextBatchSize, longVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); - } - } - - if (inDictionaryStream == null) { - Arrays.fill(inDictionary, true); - } - else { - inDictionaryStream.getSetBits(nextBatchSize, inDictionary, longVector.isNull); - } - - for (int i = 0; i < nextBatchSize; i++) { - if (!longVector.isNull[i]) { - if (inDictionary[i]) { - longVector.vector[i] = dictionary[((int) longVector.vector[i])]; - } - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - // 
read the dictionary - if (!dictionaryOpen && dictionarySize > 0) { - if (dictionary.length < dictionarySize) { - dictionary = new long[dictionarySize]; - } - - LongStream dictionaryStream = dictionaryDataStreamSource.openStream(); - verifyFormat(dictionaryStream != null, "Dictionary is not empty but data stream is not present"); - dictionaryStream.nextLongVector(dictionarySize, dictionary); - } - dictionaryOpen = true; - - presentStream = presentStreamSource.openStream(); - inDictionaryStream = inDictionaryStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - dictionaryDataStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, LongStream.class); - dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); - dictionaryOpen = false; - - inDictionaryStreamSource = missingStreamSource(BooleanStream.class); - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - inDictionaryStreamSource = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - 
.toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java deleted file mode 100644 index b50201cc0e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongDirectStreamReader.java +++ /dev/null @@ -1,155 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.DATA; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.PRESENT; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class LongDirectStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream dataStream; - - private boolean rowGroupOpen; - - public LongDirectStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void 
readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(readOffset); - } - } - - LongVector longVector = (LongVector) vector; - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(longVector.isNull, false); - dataStream.nextLongVector(nextBatchSize, longVector.vector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); - - readOffset = 0; - nextBatchSize = 0; 
- - presentStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java deleted file mode 100644 index 6943049acd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/LongStreamReader.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; - -public class LongStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - private final LongDirectStreamReader directReader; - private final LongDictionaryStreamReader dictionaryReader; - private StreamReader currentReader; - - public LongStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - directReader = new LongDirectStreamReader(streamDescriptor); - dictionaryReader = new LongDictionaryStreamReader(streamDescriptor); - } - - @Override - public void prepareNextRead(int batchSize) - { - currentReader.prepareNextRead(batchSize); - } - - @Override - public void readBatch(Object vector) - throws IOException - { - currentReader.readBatch(vector); - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - ColumnEncodingKind kind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); - if (kind == DIRECT || kind == DIRECT_V2 || kind == DWRF_DIRECT) { - currentReader = directReader; - } - else if (kind == DICTIONARY) { - currentReader = dictionaryReader; - } - else { - throw new IllegalArgumentException("Unsupported encoding " + kind); - } - - currentReader.startStripe(dictionaryStreamSources, encoding); - } - - @Override - public void 
startRowGroup(StreamSources dataStreamSources) - throws IOException - { - currentReader.startRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java deleted file mode 100644 index bf7f362be5..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDictionaryStreamReader.java +++ /dev/null @@ -1,287 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.SliceVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.*; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class SliceDictionaryStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource dictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); - private boolean dictionaryOpen; - private int dictionarySize; - @Nonnull - private Slice[] dictionary = new Slice[0]; - - @Nonnull - private StreamSource dictionaryLengthStreamSource = missingStreamSource(LongStream.class); - @Nonnull - private int[] dictionaryLength = new int[0]; - - @Nonnull - private StreamSource inDictionaryStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream inDictionaryStream; - private final boolean[] 
inDictionary = new boolean[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource rowGroupDictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); - @Nonnull - private Slice[] rowGroupDictionary = new Slice[0]; - - @Nonnull - private StreamSource rowGroupDictionaryLengthStreamSource = missingStreamSource(RowGroupDictionaryLengthStream.class); - @Nonnull - private int[] rowGroupDictionaryLength = new int[0]; - - @Nonnull - private StreamSource dataStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream dataStream; - @Nonnull - private final int[] dataVector = new int[Vector.MAX_VECTOR_LENGTH]; - - private boolean rowGroupOpen; - - public SliceDictionaryStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the length reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - if (inDictionaryStream != null) { - inDictionaryStream.skip(readOffset); - } - dataStream.skip(readOffset); - } - } - - SliceVector sliceVector = (SliceVector) vector; - - if (presentStream == null) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - Arrays.fill(isNullVector, false); - dataStream.nextIntVector(nextBatchSize, dataVector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector); - if (nullValues != nextBatchSize) { - verifyFormat(dataStream != null, "Value is 
not null but data stream is not present"); - dataStream.nextIntVector(nextBatchSize, dataVector, isNullVector); - } - } - - if (inDictionaryStream == null) { - Arrays.fill(inDictionary, true); - } - else { - inDictionaryStream.getSetBits(nextBatchSize, inDictionary, isNullVector); - } - - for (int i = 0; i < nextBatchSize; i++) { - if (isNullVector[i]) { - sliceVector.vector[i] = null; - } - else if (inDictionary[i]) { - sliceVector.vector[i] = dictionary[dataVector[i]]; - } - else { - sliceVector.vector[i] = rowGroupDictionary[dataVector[i]]; - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - // read the dictionary - if (!dictionaryOpen && dictionarySize > 0) { - // resize the dictionary array if necessary - if (dictionary.length < dictionarySize) { - dictionary = new Slice[dictionarySize]; - dictionaryLength = new int[dictionarySize]; - } - - // read the lengths - LongStream lengthStream = dictionaryLengthStreamSource.openStream(); - verifyFormat(lengthStream != null, "Dictionary is not empty but dictionary length stream is not present"); - lengthStream.nextIntVector(dictionarySize, dictionaryLength); - - ByteArrayStream dictionaryDataStream = dictionaryDataStreamSource.openStream(); - readDictionary(dictionaryDataStream, dictionarySize, dictionaryLength, dictionary); - } - dictionaryOpen = true; - - // read row group dictionary - RowGroupDictionaryLengthStream dictionaryLengthStream = rowGroupDictionaryLengthStreamSource.openStream(); - if (dictionaryLengthStream != null) { - int rowGroupDictionarySize = dictionaryLengthStream.getEntryCount(); - - // resize the dictionary array if necessary - if (rowGroupDictionary.length < rowGroupDictionarySize) { - rowGroupDictionary = new Slice[rowGroupDictionarySize]; - rowGroupDictionaryLength = new int[rowGroupDictionarySize]; - } - - // read the lengths - dictionaryLengthStream.nextIntVector(rowGroupDictionarySize, rowGroupDictionaryLength); - - ByteArrayStream 
dictionaryDataStream = rowGroupDictionaryDataStreamSource.openStream(); - readDictionary(dictionaryDataStream, rowGroupDictionarySize, rowGroupDictionaryLength, rowGroupDictionary); - } - dictionaryOpen = true; - - presentStream = presentStreamSource.openStream(); - inDictionaryStream = inDictionaryStreamSource.openStream(); - dataStream = dataStreamSource.openStream(); - - rowGroupOpen = true; - } - - private static void readDictionary(@Nullable ByteArrayStream dictionaryDataStream, int dictionarySize, int[] dictionaryLength, Slice[] dictionary) - throws IOException - { - // sum lengths - int totalLength = 0; - for (int i = 0; i < dictionarySize; i++) { - totalLength += dictionaryLength[i]; - } - - // read dictionary data - byte[] dictionaryData = new byte[0]; - if (totalLength > 0) { - verifyFormat(dictionaryDataStream != null, "Dictionary length is not zero but dictionary data stream is not present"); - dictionaryData = dictionaryDataStream.next(totalLength); - } - - // build dictionary slices - int offset = 0; - for (int i = 0; i < dictionarySize; i++) { - int length = dictionaryLength[i]; - dictionary[i] = Slices.wrappedBuffer(dictionaryData, offset, length); - offset += length; - } - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - dictionaryDataStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, DICTIONARY_DATA, ByteArrayStream.class); - dictionaryLengthStreamSource = dictionaryStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class); - dictionarySize = encoding.get(streamDescriptor.getStreamId()).getDictionarySize(); - dictionaryOpen = false; - - presentStreamSource = missingStreamSource(BooleanStream.class); - dataStreamSource = missingStreamSource(LongStream.class); - - inDictionaryStreamSource = missingStreamSource(BooleanStream.class); - rowGroupDictionaryLengthStreamSource = missingStreamSource(RowGroupDictionaryLengthStream.class); - 
rowGroupDictionaryDataStreamSource = missingStreamSource(ByteArrayStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - dataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, LongStream.class); - - // the "in dictionary" stream signals if the value is in the stripe or row group dictionary - inDictionaryStreamSource = dataStreamSources.getStreamSource(streamDescriptor, IN_DICTIONARY, BooleanStream.class); - rowGroupDictionaryLengthStreamSource = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY_LENGTH, RowGroupDictionaryLengthStream.class); - rowGroupDictionaryDataStreamSource = dataStreamSources.getStreamSource(streamDescriptor, ROW_GROUP_DICTIONARY, ByteArrayStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - inDictionaryStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java deleted file mode 100644 index 994b25d29a..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceDirectStreamReader.java +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.SliceVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.*; -import io.airlift.slice.Slices; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class SliceDirectStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - private final boolean[] isNullVector = new boolean[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource lengthStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream lengthStream; - private 
final int[] lengthVector = new int[Vector.MAX_VECTOR_LENGTH]; - - @Nonnull - private StreamSource dataByteSource = missingStreamSource(ByteArrayStream.class); - @Nullable - private ByteArrayStream dataStream; - - private boolean rowGroupOpen; - - public SliceDirectStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the length reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - long dataSkipSize = lengthStream.sum(readOffset); - if (dataSkipSize > 0) { - verifyFormat(dataStream != null, "Value is not null but data stream is not present"); - dataStream.skip(Ints.checkedCast(dataSkipSize)); - } - } - } - - SliceVector sliceVector = (SliceVector) vector; - if (presentStream == null) { - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - lengthStream.nextIntVector(nextBatchSize, lengthVector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, isNullVector); - if (nullValues != nextBatchSize) { - verifyFormat(lengthStream != null, "Value is not null but length stream is not present"); - lengthStream.nextIntVector(nextBatchSize, lengthVector, isNullVector); - } - } - - int totalLength = 0; - for (int i = 0; i < nextBatchSize; i++) { - if (!isNullVector[i]) { - totalLength += lengthVector[i]; - } - } - - byte[] data = new byte[0]; - if (totalLength > 0) { - verifyFormat(dataStream != null, "Value is 
not null but data stream is not present"); - data = dataStream.next(totalLength); - } - - int offset = 0; - for (int i = 0; i < nextBatchSize; i++) { - if (!isNullVector[i]) { - int length = lengthVector[i]; - sliceVector.vector[i] = Slices.wrappedBuffer(data, offset, length); - offset += length; - } - else { - sliceVector.vector[i] = null; - } - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - lengthStream = lengthStreamSource.openStream(); - dataStream = dataByteSource.openStream(); - - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - lengthStreamSource = missingStreamSource(LongStream.class); - dataByteSource = missingStreamSource(ByteArrayStream.class); - - readOffset = 0; - nextBatchSize = 0; - - Arrays.fill(isNullVector, false); - - presentStream = null; - lengthStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - lengthStreamSource = dataStreamSources.getStreamSource(streamDescriptor, LENGTH, LongStream.class); - dataByteSource = dataStreamSources.getStreamSource(streamDescriptor, DATA, ByteArrayStream.class); - - readOffset = 0; - nextBatchSize = 0; - - Arrays.fill(isNullVector, false); - - presentStream = null; - lengthStream = null; - dataStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java deleted file mode 100644 index e046dff632..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/SliceStreamReader.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; - -import java.io.IOException; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding.ColumnEncodingKind.*; - -public class SliceStreamReader - implements StreamReader -{ - private final StreamDescriptor streamDescriptor; - private final SliceDirectStreamReader directReader; - private final SliceDictionaryStreamReader dictionaryReader; - private StreamReader currentReader; - - public SliceStreamReader(StreamDescriptor streamDescriptor) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - directReader = new 
SliceDirectStreamReader(streamDescriptor); - dictionaryReader = new SliceDictionaryStreamReader(streamDescriptor); - } - - @Override - public void readBatch(Object vector) - throws IOException - { - currentReader.readBatch(vector); - } - - @Override - public void prepareNextRead(int batchSize) - { - currentReader.prepareNextRead(batchSize); - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - ColumnEncodingKind columnEncodingKind = encoding.get(streamDescriptor.getStreamId()).getColumnEncodingKind(); - if (columnEncodingKind == DIRECT || columnEncodingKind == DIRECT_V2 || columnEncodingKind == DWRF_DIRECT) { - currentReader = directReader; - } - else if (columnEncodingKind == DICTIONARY || columnEncodingKind == DICTIONARY_V2) { - currentReader = dictionaryReader; - } - else { - throw new IllegalArgumentException("Unsupported encoding " + columnEncodingKind); - } - - currentReader.startStripe(dictionaryStreamSources, encoding); - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - currentReader.startRowGroup(dataStreamSources); - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java deleted file mode 100644 index 7d0e8cc9f2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/StreamReaders.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.joda.time.DateTimeZone; - -public final class StreamReaders -{ - private StreamReaders() - { - } - - public static StreamReader createStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - switch (streamDescriptor.getStreamType()) { - case BOOLEAN: - return new BooleanStreamReader(streamDescriptor); - case BYTE: - return new ByteStreamReader(streamDescriptor); - case SHORT: - case INT: - case LONG: - case DATE: - return new LongStreamReader(streamDescriptor); - case FLOAT: - return new FloatStreamReader(streamDescriptor); - case DOUBLE: - return new DoubleStreamReader(streamDescriptor); - case BINARY: - case STRING: - return new SliceStreamReader(streamDescriptor); - case TIMESTAMP: - return new TimestampStreamReader(streamDescriptor, hiveStorageTimeZone); - case STRUCT: - case LIST: - case MAP: - return new JsonStreamReader(streamDescriptor, hiveStorageTimeZone); - case UNION: - case DECIMAL: - case VARCHAR: - case CHAR: - default: - throw new IllegalArgumentException("Unsupported type: " + streamDescriptor.getStreamType()); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java deleted file mode 100644 index ba96f7cdcb..0000000000 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/reader/TimestampStreamReader.java +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.reader; - -import org.apache.tajo.storage.thirdparty.orc.LongVector; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.metadata.ColumnEncoding; -import org.apache.tajo.storage.thirdparty.orc.stream.BooleanStream; -import org.apache.tajo.storage.thirdparty.orc.stream.LongStream; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSource; -import org.apache.tajo.storage.thirdparty.orc.stream.StreamSources; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; - -import javax.annotation.Nonnull; -import javax.annotation.Nullable; -import java.io.IOException; -import java.util.Arrays; -import java.util.List; - -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class TimestampStreamReader - implements StreamReader -{ 
- private static final int MILLIS_PER_SECOND = 1000; - - private final StreamDescriptor streamDescriptor; - private final long baseTimestampInSeconds; - - private int readOffset; - private int nextBatchSize; - - @Nonnull - private StreamSource presentStreamSource = missingStreamSource(BooleanStream.class); - @Nullable - private BooleanStream presentStream; - - @Nonnull - private StreamSource secondsStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream secondsStream; - - @Nonnull - private StreamSource nanosStreamSource = missingStreamSource(LongStream.class); - @Nullable - private LongStream nanosStream; - - private final long[] nanosVector = new long[Vector.MAX_VECTOR_LENGTH]; - - private boolean rowGroupOpen; - - public TimestampStreamReader(StreamDescriptor streamDescriptor, DateTimeZone hiveStorageTimeZone) - { - this.streamDescriptor = checkNotNull(streamDescriptor, "stream is null"); - this.baseTimestampInSeconds = new DateTime(2015, 1, 1, 0, 0, checkNotNull(hiveStorageTimeZone, "hiveStorageTimeZone is null")).getMillis() / MILLIS_PER_SECOND; - } - - @Override - public void prepareNextRead(int batchSize) - { - readOffset += nextBatchSize; - nextBatchSize = batchSize; - } - - @Override - public void readBatch(Object vector) - throws IOException - { - if (!rowGroupOpen) { - openRowGroup(); - } - - if (readOffset > 0) { - if (presentStream != null) { - // skip ahead the present bit reader, but count the set bits - // and use this as the skip size for the data reader - readOffset = presentStream.countBitsSet(readOffset); - } - if (readOffset > 0) { - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - secondsStream.skip(readOffset); - nanosStream.skip(readOffset); - } - } - - LongVector longVector = (LongVector) vector; - if (presentStream == null) { - verifyFormat(secondsStream != null, "Value is 
not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - Arrays.fill(longVector.isNull, false); - secondsStream.nextLongVector(nextBatchSize, longVector.vector); - nanosStream.nextLongVector(nextBatchSize, nanosVector); - } - else { - int nullValues = presentStream.getUnsetBits(nextBatchSize, longVector.isNull); - if (nullValues != nextBatchSize) { - verifyFormat(secondsStream != null, "Value is not null but seconds stream is not present"); - verifyFormat(nanosStream != null, "Value is not null but nanos stream is not present"); - - secondsStream.nextLongVector(nextBatchSize, longVector.vector, longVector.isNull); - nanosStream.nextLongVector(nextBatchSize, nanosVector, longVector.isNull); - } - } - - // merge seconds and nanos together - for (int i = 0; i < nextBatchSize; i++) { - longVector.vector[i] = decodeTimestamp(longVector.vector[i], nanosVector[i], baseTimestampInSeconds); - } - - readOffset = 0; - nextBatchSize = 0; - } - - private void openRowGroup() - throws IOException - { - presentStream = presentStreamSource.openStream(); - secondsStream = secondsStreamSource.openStream(); - nanosStream = nanosStreamSource.openStream(); - rowGroupOpen = true; - } - - @Override - public void startStripe(StreamSources dictionaryStreamSources, List encoding) - throws IOException - { - presentStreamSource = missingStreamSource(BooleanStream.class); - secondsStreamSource = missingStreamSource(LongStream.class); - nanosStreamSource = missingStreamSource(LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - secondsStream = null; - nanosStream = null; - - rowGroupOpen = false; - } - - @Override - public void startRowGroup(StreamSources dataStreamSources) - throws IOException - { - presentStreamSource = dataStreamSources.getStreamSource(streamDescriptor, PRESENT, BooleanStream.class); - secondsStreamSource = dataStreamSources.getStreamSource(streamDescriptor, 
DATA, LongStream.class); - nanosStreamSource = dataStreamSources.getStreamSource(streamDescriptor, SECONDARY, LongStream.class); - - readOffset = 0; - nextBatchSize = 0; - - presentStream = null; - secondsStream = null; - nanosStream = null; - - rowGroupOpen = false; - } - - @Override - public String toString() - { - return toStringHelper(this) - .addValue(streamDescriptor) - .toString(); - } - - // This comes from the Apache Hive ORC code - public static long decodeTimestamp(long seconds, long serializedNanos, long baseTimestampInSeconds) - { - long millis = (seconds + baseTimestampInSeconds) * MILLIS_PER_SECOND; - long nanos = parseNanos(serializedNanos); - - // the rounding error exists because java always rounds up when dividing integers - // -42001/1000 = -42; and -42001 % 1000 = -1 (+ 1000) - // to get the correct value we need - // (-42 - 1)*1000 + 999 = -42001 - // (42)*1000 + 1 = 42001 - if (millis < 0 && nanos != 0) { - millis -= 1000; - } - // Truncate nanos to millis and add to mills - return millis + (nanos / 1000000); - } - - // This comes from the Apache Hive ORC code - private static int parseNanos(long serialized) - { - int zeros = ((int) serialized) & 0x7; // 0b111 - int result = (int) (serialized >>> 3); - if (zeros != 0) { - for (int i = 0; i <= zeros; ++i) { - result *= 10; - } - } - return result; - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java deleted file mode 100644 index 853609af56..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteArrayStream.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteArrayStreamCheckpoint; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; - -public class ByteArrayStream - implements ValueStream -{ - private final OrcInputStream inputStream; - - public ByteArrayStream(OrcInputStream inputStream) - { - this.inputStream = checkNotNull(inputStream, "inputStream is null"); - } - - public byte[] next(int length) - throws IOException - { - byte[] data = new byte[length]; - readFully(inputStream, data, 0, length); - return data; - } - - public void next(int length, byte[] data) - throws IOException - { - readFully(inputStream, data, 0, length); - } - - @Override - public Class getCheckpointType() - { - return ByteArrayStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(ByteArrayStreamCheckpoint checkpoint) - throws IOException - { - inputStream.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); - } - - @Override - public void skip(int skipSize) - throws IOException - { - skipFully(inputStream, skipSize); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java deleted file 
mode 100644 index adb27cbeb9..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/ByteStream.java +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.checkpoint.ByteStreamCheckpoint; - -import java.io.IOException; -import java.util.Arrays; - -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; - -public class ByteStream - implements ValueStream -{ - private final OrcInputStream input; - private final byte[] buffer = new byte[MIN_REPEAT_SIZE + 127]; - private int length; - private int offset; - private long lastReadInputCheckpoint; - - public ByteStream(OrcInputStream input) - { - this.input = input; - lastReadInputCheckpoint = input.getCheckpoint(); - } - - // This is based on the Apache Hive ORC code - private void readNextBlock() - throws IOException - { - lastReadInputCheckpoint = input.getCheckpoint(); - - int control = input.read(); - verifyFormat(control != -1, "Read past end of buffer RLE byte from %s", input); - - offset = 0; - - // if byte high bit is not set, this is a repetition; otherwise it is a literal sequence - if ((control & 0x80) == 0) 
{ - length = control + MIN_REPEAT_SIZE; - - // read the repeated value - int value = input.read(); - verifyFormat(value != -1, "Reading RLE byte got EOF"); - - // fill buffer with the value - Arrays.fill(buffer, 0, length, (byte) value); - } - else { - // length is 2's complement of byte - length = 0x100 - control; - - // read the literals into the buffer - readFully(input, buffer, 0, length); - } - } - - @Override - public Class getCheckpointType() - { - return ByteStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(ByteStreamCheckpoint checkpoint) - throws IOException - { - // if the checkpoint is within the current buffer, just adjust the pointer - if (lastReadInputCheckpoint == checkpoint.getInputStreamCheckpoint() && checkpoint.getOffset() <= length) { - offset = checkpoint.getOffset(); - } - else { - // otherwise, discard the buffer and start over - input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); - length = 0; - offset = 0; - skip(checkpoint.getOffset()); - } - } - - @Override - public void skip(int items) - throws IOException - { - while (items > 0) { - if (offset == length) { - readNextBlock(); - } - long consume = Math.min(items, length - offset); - offset += consume; - items -= consume; - } - } - - public byte next() - throws IOException - { - if (offset == length) { - readNextBlock(); - } - return buffer[offset++]; - } - - public void nextVector(long items, long[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = next(); - } - } - - public void nextVector(long items, long[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = next(); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java deleted file mode 
100644 index 6c3e5ea6c9..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/CheckpointStreamSource.java +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.StreamCheckpoint; - -import javax.annotation.Nullable; -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkNotNull; - -public class CheckpointStreamSource, C extends StreamCheckpoint> - implements StreamSource -{ - public static , C extends StreamCheckpoint> CheckpointStreamSource createCheckpointStreamSource(S stream, StreamCheckpoint checkpoint) - { - checkNotNull(stream, "stream is null"); - checkNotNull(checkpoint, "checkpoint is null"); - - Class checkpointType = stream.getCheckpointType(); - C verifiedCheckpoint = OrcStreamUtils.checkType(checkpoint, checkpointType, "Checkpoint"); - return new CheckpointStreamSource(stream, verifiedCheckpoint); - } - - private final S stream; - private final C checkpoint; - - public CheckpointStreamSource(S stream, C checkpoint) - { - this.stream = checkNotNull(stream, "stream is null"); - this.checkpoint = checkNotNull(checkpoint, "checkpoint is null"); - } - - @Override - public Class getStreamType() - { - return (Class) stream.getClass(); - } - - @Nullable - @Override - public S 
openStream() - throws IOException - { - stream.seekToCheckpoint(checkpoint); - return stream; - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("stream", stream) - .add("checkpoint", checkpoint) - .toString(); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java deleted file mode 100644 index 08f1f160e2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/DoubleStream.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.DoubleStreamCheckpoint; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkPositionIndex; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; -import static io.airlift.slice.SizeOf.SIZE_OF_DOUBLE; - -public class DoubleStream - implements ValueStream -{ - private final OrcInputStream input; - private final byte[] buffer = new byte[Vector.MAX_VECTOR_LENGTH * SIZE_OF_DOUBLE]; - private final Slice slice = Slices.wrappedBuffer(buffer); - - public DoubleStream(OrcInputStream input) - { - this.input = input; - } - - @Override - public Class getCheckpointType() - { - return DoubleStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(DoubleStreamCheckpoint checkpoint) - throws IOException - { - input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); - } - - @Override - public void skip(int items) - throws IOException - { - long length = items * SIZE_OF_DOUBLE; - skipFully(input, length); - } - - public double next() - throws IOException - { - readFully(input, buffer, 0, SIZE_OF_DOUBLE); - return slice.getDouble(0); - } - - public void nextVector(int items, double[] vector) - throws IOException - { - checkPositionIndex(items, vector.length); - checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); - - // buffer that number of values - readFully(input, buffer, 0, items * SIZE_OF_DOUBLE); - - // copy values directly into vector - Slices.wrappedDoubleArray(vector).setBytes(0, slice, 0, items * SIZE_OF_DOUBLE); - } - - public void nextVector(long items, double[] vector, boolean[] isNull) - throws IOException - { - // count the number of non nulls - int 
notNullCount = 0; - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - notNullCount++; - } - } - - // buffer that umber of values - readFully(input, buffer, 0, notNullCount * SIZE_OF_DOUBLE); - - // load them into the buffer - int elementIndex = 0; - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = slice.getDouble(elementIndex); - elementIndex += SIZE_OF_DOUBLE; - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java deleted file mode 100644 index 722c9470fd..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/FloatStream.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.FloatStreamCheckpoint; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkPositionIndex; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.readFully; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.skipFully; -import static io.airlift.slice.SizeOf.SIZE_OF_FLOAT; - -public class FloatStream - implements ValueStream -{ - private final OrcInputStream input; - private final byte[] buffer = new byte[Vector.MAX_VECTOR_LENGTH * SIZE_OF_FLOAT]; - private final Slice slice = Slices.wrappedBuffer(buffer); - - public FloatStream(OrcInputStream input) - { - this.input = input; - } - - @Override - public Class getCheckpointType() - { - return FloatStreamCheckpoint.class; - } - - @Override - public void seekToCheckpoint(FloatStreamCheckpoint checkpoint) - throws IOException - { - input.seekToCheckpoint(checkpoint.getInputStreamCheckpoint()); - } - - @Override - public void skip(int items) - throws IOException - { - long length = items * SIZE_OF_FLOAT; - skipFully(input, length); - } - - public float next() - throws IOException - { - readFully(input, buffer, 0, SIZE_OF_FLOAT); - return slice.getFloat(0); - } - - public void nextVector(int items, double[] vector) - throws IOException - { - checkPositionIndex(items, vector.length); - checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); - - // buffer that number of values - readFully(input, buffer, 0, items * SIZE_OF_FLOAT); - - // load them into the buffer one at a time since we are reading - // floats into a double vector - int elementIndex = 0; - for (int i = 0; i < items; i++) { - vector[i] = slice.getFloat(elementIndex); - elementIndex += SIZE_OF_FLOAT; - } - } - - public void nextVector(long items, 
double[] vector, boolean[] isNull) - throws IOException - { - // count the number of non nulls - int notNullCount = 0; - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - notNullCount++; - } - } - - // buffer that umber of values - readFully(input, buffer, 0, notNullCount * SIZE_OF_FLOAT); - - // load them into the buffer - int elementIndex = 0; - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = slice.getFloat(elementIndex); - elementIndex += SIZE_OF_FLOAT; - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java deleted file mode 100644 index 40753bfe75..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongDecode.java +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; - -import java.io.IOException; -import java.io.InputStream; - -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; -import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.FixedBitSizes.*; - -// This is based on the Apache Hive ORC code -public final class LongDecode -{ - private LongDecode() - { - } - - enum FixedBitSizes - { - ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, - THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, - TWENTY, TWENTY_ONE, TWENTY_TWO, TWENTY_THREE, TWENTY_FOUR, TWENTY_SIX, - TWENTY_EIGHT, THIRTY, THIRTY_TWO, FORTY, FORTY_EIGHT, FIFTY_SIX, SIXTY_FOUR; - } - - /** - * Decodes the ordinal fixed bit value to actual fixed bit width value. - */ - public static int decodeBitWidth(int n) - { - if (n >= ONE.ordinal() && n <= TWENTY_FOUR.ordinal()) { - return n + 1; - } - else if (n == TWENTY_SIX.ordinal()) { - return 26; - } - else if (n == TWENTY_EIGHT.ordinal()) { - return 28; - } - else if (n == THIRTY.ordinal()) { - return 30; - } - else if (n == THIRTY_TWO.ordinal()) { - return 32; - } - else if (n == FORTY.ordinal()) { - return 40; - } - else if (n == FORTY_EIGHT.ordinal()) { - return 48; - } - else if (n == FIFTY_SIX.ordinal()) { - return 56; - } - else { - return 64; - } - } - - /** - * Gets the closest supported fixed bit width for the specified bit width. 
- */ - public static int getClosestFixedBits(int width) - { - if (width == 0) { - return 1; - } - - if (width >= 1 && width <= 24) { - return width; - } - else if (width > 24 && width <= 26) { - return 26; - } - else if (width > 26 && width <= 28) { - return 28; - } - else if (width > 28 && width <= 30) { - return 30; - } - else if (width > 30 && width <= 32) { - return 32; - } - else if (width > 32 && width <= 40) { - return 40; - } - else if (width > 40 && width <= 48) { - return 48; - } - else if (width > 48 && width <= 56) { - return 56; - } - else { - return 64; - } - } - - public static long readSignedVInt(InputStream inputStream) - throws IOException - { - long result = readUnsignedVInt(inputStream); - return (result >>> 1) ^ -(result & 1); - } - - public static long readUnsignedVInt(InputStream inputStream) - throws IOException - { - long result = 0; - int offset = 0; - long b; - do { - b = inputStream.read(); - verifyFormat(b != -1, "EOF while reading unsigned vint"); - result |= (b & 0x7F /* 0b0111_1111 */) << offset; - offset += 7; - } while ((b & 0x80 /* 0b1000_0000 */) != 0); - return result; - } - - public static long readVInt(boolean signed, InputStream inputStream) - throws IOException - { - if (signed) { - return readSignedVInt(inputStream); - } - else { - return readUnsignedVInt(inputStream); - } - } - - public static long zigzagDecode(long value) - { - return (value >>> 1) ^ -(value & 1); - } - - public static long readDwrfLong(InputStream input, OrcTypeKind type, boolean signed, boolean usesVInt) - throws IOException - { - if (usesVInt) { - return readVInt(signed, input); - } - else if (type == SHORT) { - return input.read() | (input.read() << 8); - } - else if (type == INT) { - return input.read() | (input.read() << 8) | (input.read() << 16) | (input.read() << 24); - } - else if (type == LONG) { - return ((long) input.read()) | - (((long) input.read()) << 8) | - (((long) input.read()) << 16) | - (((long) input.read()) << 24) | - (((long) 
input.read()) << 32) | - (((long) input.read()) << 40) | - (((long) input.read()) << 48) | - (((long) input.read()) << 56); - } - else { - throw new IllegalArgumentException(type + " type is not supported"); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java deleted file mode 100644 index e037be6c3e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamDwrf.java +++ /dev/null @@ -1,129 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.Vector; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamDwrfCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.metadata.OrcType.OrcTypeKind; - -import java.io.IOException; - -import static com.google.common.base.Preconditions.checkPositionIndex; -import static org.apache.tajo.storage.thirdparty.orc.stream.LongDecode.readDwrfLong; - -public class LongStreamDwrf - implements LongStream -{ - private final OrcInputStream input; - private final OrcTypeKind orcTypeKind; - private final boolean signed; - private final boolean usesVInt; - - public LongStreamDwrf(OrcInputStream input, OrcTypeKind type, boolean signed, boolean usesVInt) - { - this.input = input; - this.orcTypeKind = type; - this.signed = signed; - this.usesVInt = usesVInt; - } - - @Override - public Class getCheckpointType() - { - return LongStreamDwrfCheckpoint.class; - } - - @Override - public void seekToCheckpoint(LongStreamCheckpoint checkpoint) - throws IOException - { - LongStreamDwrfCheckpoint dwrfCheckpoint = OrcStreamUtils.checkType(checkpoint, LongStreamDwrfCheckpoint.class, "Checkpoint"); - input.seekToCheckpoint(dwrfCheckpoint.getInputStreamCheckpoint()); - } - - @Override - public void skip(int items) - throws IOException - { - // there is no fast way to skip values - for (int i = 0; i < items; i++) { - next(); - } - } - - @Override - public long sum(int items) - throws IOException - { - long sum = 0; - for (int i = 0; i < items; i++) { - sum += next(); - } - return sum; - } - - @Override - public long next() - throws IOException - { - return readDwrfLong(input, orcTypeKind, signed, usesVInt); - } - - @Override - public void nextIntVector(int items, int[] vector) - throws IOException - { - checkPositionIndex(items, vector.length); - 
checkPositionIndex(items, Vector.MAX_VECTOR_LENGTH); - - for (int i = 0; i < items; i++) { - vector[i] = Ints.checkedCast(next()); - } - } - - @Override - public void nextIntVector(int items, int[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = Ints.checkedCast(next()); - } - } - } - - @Override - public void nextLongVector(int items, long[] vector) - throws IOException - { - checkPositionIndex(items, vector.length); - - for (int i = 0; i < items; i++) { - vector[i] = next(); - } - } - - @Override - public void nextLongVector(int items, long[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = next(); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java deleted file mode 100644 index 29a6d25ef6..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV1.java +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV1Checkpoint; - -import java.io.IOException; - -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; - -public class LongStreamV1 - implements LongStream -{ - private static final int MAX_LITERAL_SIZE = 128; - - private final OrcInputStream input; - private final boolean signed; - private final long[] literals = new long[MAX_LITERAL_SIZE]; - private int numLiterals; - private int delta; - private int used; - private boolean repeat; - private long lastReadInputCheckpoint; - - public LongStreamV1(OrcInputStream input, boolean signed) - { - this.input = input; - this.signed = signed; - lastReadInputCheckpoint = input.getCheckpoint(); - } - - // This comes from the Apache Hive ORC code - private void readValues() - throws IOException - { - lastReadInputCheckpoint = input.getCheckpoint(); - - int control = input.read(); - verifyFormat(control != -1, "Read past end of RLE integer from %s", input); - - if (control < 0x80) { - numLiterals = control + MIN_REPEAT_SIZE; - used = 0; - repeat = true; - delta = input.read(); - verifyFormat(delta != -1, "End of stream in RLE Integer from %s", input); - - // convert from 0 to 255 to -128 to 127 by converting to a signed byte - // noinspection SillyAssignment - delta = (byte) delta; - literals[0] = LongDecode.readVInt(signed, input); - } - else { - numLiterals = 0x100 - control; - used = 0; - repeat = false; - for (int i = 0; i < numLiterals; ++i) { - literals[i] = LongDecode.readVInt(signed, input); - } - } - } - - @Override - // This comes from the Apache Hive ORC code - public long next() - throws IOException - { - long result; - if (used == 
numLiterals) { - readValues(); - } - if (repeat) { - result = literals[0] + (used++) * delta; - } - else { - result = literals[used++]; - } - return result; - } - - @Override - public Class getCheckpointType() - { - return LongStreamV1Checkpoint.class; - } - - @Override - public void seekToCheckpoint(LongStreamCheckpoint checkpoint) - throws IOException - { - LongStreamV1Checkpoint v1Checkpoint = OrcStreamUtils.checkType(checkpoint, LongStreamV1Checkpoint.class, "Checkpoint"); - - // if the checkpoint is within the current buffer, just adjust the pointer - if (lastReadInputCheckpoint == v1Checkpoint.getInputStreamCheckpoint() && v1Checkpoint.getOffset() <= numLiterals) { - used = v1Checkpoint.getOffset(); - } - else { - // otherwise, discard the buffer and start over - input.seekToCheckpoint(v1Checkpoint.getInputStreamCheckpoint()); - numLiterals = 0; - used = 0; - skip(v1Checkpoint.getOffset()); - } - } - - @Override - public void skip(int items) - throws IOException - { - while (items > 0) { - if (used == numLiterals) { - readValues(); - } - long consume = Math.min(items, numLiterals - used); - used += consume; - items -= consume; - } - } - - @Override - public long sum(int items) - throws IOException - { - long sum = 0; - for (int i = 0; i < items; i++) { - sum += next(); - } - return sum; - } - - @Override - public void nextLongVector(int items, long[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = next(); - } - } - - @Override - public void nextLongVector(int items, long[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = next(); - } - } - } - - @Override - public void nextIntVector(int items, int[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = Ints.checkedCast(next()); - } - } - - @Override - public void nextIntVector(int items, int[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) 
{ - if (!isNull[i]) { - vector[i] = Ints.checkedCast(next()); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java deleted file mode 100644 index f22b3681d2..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/LongStreamV2.java +++ /dev/null @@ -1,452 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.primitives.Ints; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamCheckpoint; -import org.apache.tajo.storage.thirdparty.orc.checkpoint.LongStreamV2Checkpoint; - -import java.io.IOException; -import java.io.InputStream; - -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.stream.OrcStreamUtils.MIN_REPEAT_SIZE; - -/** - * @see {@link org.apache.hadoop.hive.ql.io.orc.RunLengthIntegerWriterV2} for description of various lightweight compression techniques. 
- */ -// This comes from the Apache Hive ORC code -public class LongStreamV2 - implements LongStream -{ - private static final int MAX_LITERAL_SIZE = 512; - - private enum EncodingType - { - SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA - } - - private final OrcInputStream input; - private final boolean signed; - private final long[] literals = new long[MAX_LITERAL_SIZE]; - private int numLiterals; - private int used; - private final boolean skipCorrupt; - private long lastReadInputCheckpoint; - - public LongStreamV2(OrcInputStream input, boolean signed, boolean skipCorrupt) - { - this.input = input; - this.signed = signed; - this.skipCorrupt = skipCorrupt; - lastReadInputCheckpoint = input.getCheckpoint(); - } - - // This comes from the Apache Hive ORC code - private void readValues() - throws IOException - { - lastReadInputCheckpoint = input.getCheckpoint(); - - // read the first 2 bits and determine the encoding type - int firstByte = input.read(); - verifyFormat(firstByte >= 0, "Read past end of RLE integer from %s", input); - - int enc = (firstByte >>> 6) & 0x03; - if (EncodingType.SHORT_REPEAT.ordinal() == enc) { - readShortRepeatValues(firstByte); - } - else if (EncodingType.DIRECT.ordinal() == enc) { - readDirectValues(firstByte); - } - else if (EncodingType.PATCHED_BASE.ordinal() == enc) { - readPatchedBaseValues(firstByte); - } - else { - readDeltaValues(firstByte); - } - } - - // This comes from the Apache Hive ORC code - private void readDeltaValues(int firstByte) - throws IOException - { - // extract the number of fixed bits - int fixedBits = (firstByte >>> 1) & 0x1f; - if (fixedBits != 0) { - fixedBits = LongDecode.decodeBitWidth(fixedBits); - } - - // extract the blob run length - int length = (firstByte & 0x01) << 8; - length |= input.read(); - - // read the first value stored as vint - long firstVal = LongDecode.readVInt(signed, input); - - // store first value to result buffer - literals[numLiterals++] = firstVal; - - // if fixed bits is 0 then all 
values have fixed delta - long prevVal; - if (fixedBits == 0) { - // read the fixed delta value stored as vint (deltas can be negative even - // if all number are positive) - long fixedDelta = LongDecode.readSignedVInt(input); - - // add fixed deltas to adjacent values - for (int i = 0; i < length; i++) { - literals[numLiterals++] = literals[numLiterals - 2] + fixedDelta; - } - } - else { - long deltaBase = LongDecode.readSignedVInt(input); - // add delta base and first value - literals[numLiterals++] = firstVal + deltaBase; - prevVal = literals[numLiterals - 1]; - length -= 1; - - // write the unpacked values, add it to previous value and store final - // value to result buffer. if the delta base value is negative then it - // is a decreasing sequence else an increasing sequence - readBitPackedLongs(literals, numLiterals, length, fixedBits, input); - while (length > 0) { - if (deltaBase < 0) { - literals[numLiterals] = prevVal - literals[numLiterals]; - } - else { - literals[numLiterals] = prevVal + literals[numLiterals]; - } - prevVal = literals[numLiterals]; - length--; - numLiterals++; - } - } - } - - // This comes from the Apache Hive ORC code - private void readPatchedBaseValues(int firstByte) - throws IOException - { - // extract the number of fixed bits - int fb = LongDecode.decodeBitWidth((firstByte >>> 1) & 0x1F); // 0b1_1111 - - // extract the run length of data blob - int length = (firstByte & 0x01) << 8; - length |= input.read(); - // runs are always one off - length += 1; - - // extract the number of bytes occupied by base - int thirdByte = input.read(); - int baseWidth = (thirdByte >>> 5) & 0x07; // 0b0111 - // base width is one off - baseWidth += 1; - - // extract patch width - int patchWidth = LongDecode.decodeBitWidth(thirdByte & 0x1F); // 0b1_1111 - - // read fourth byte and extract patch gap width - int fourthByte = input.read(); - int patchGapWidth = (fourthByte >>> 5) & 0x07; // 0b0111 - // patch gap width is one off - patchGapWidth += 1; - - 
// extract the length of the patch list - int patchListLength = fourthByte & 0x1F; // 0b1_1111 - - // read the next base width number of bytes to extract base value - long base = bytesToLongBE(input, baseWidth); - long mask = (1L << ((baseWidth * 8) - 1)); - // if MSB of base value is 1 then base is negative value else positive - if ((base & mask) != 0) { - base = base & ~mask; - base = -base; - } - - // unpack the data blob - long[] unpacked = new long[length]; - readBitPackedLongs(unpacked, 0, length, fb, input); - - // unpack the patch blob - long[] unpackedPatch = new long[patchListLength]; - - verifyFormat((patchWidth + patchGapWidth) <= 64 || skipCorrupt, "ORC file is corrupt"); - - int bitSize = LongDecode.getClosestFixedBits(patchWidth + patchGapWidth); - readBitPackedLongs(unpackedPatch, 0, patchListLength, bitSize, input); - - // apply the patch directly when decoding the packed data - int patchIndex = 0; - long currentGap; - long currentPatch; - long patchMask = ((1L << patchWidth) - 1); - currentGap = unpackedPatch[patchIndex] >>> patchWidth; - currentPatch = unpackedPatch[patchIndex] & patchMask; - long actualGap = 0; - - // special case: gap is >255 then patch value will be 0. 
- // if gap is <=255 then patch value cannot be 0 - while (currentGap == 255 && currentPatch == 0) { - actualGap += 255; - patchIndex++; - currentGap = unpackedPatch[patchIndex] >>> patchWidth; - currentPatch = unpackedPatch[patchIndex] & patchMask; - } - // add the left over gap - actualGap += currentGap; - - // unpack data blob, patch it (if required), add base to get final result - for (int i = 0; i < unpacked.length; i++) { - if (i == actualGap) { - // extract the patch value - long patchedValue = unpacked[i] | (currentPatch << fb); - - // add base to patched value - literals[numLiterals++] = base + patchedValue; - - // increment the patch to point to next entry in patch list - patchIndex++; - - if (patchIndex < patchListLength) { - // read the next gap and patch - currentGap = unpackedPatch[patchIndex] >>> patchWidth; - currentPatch = unpackedPatch[patchIndex] & patchMask; - actualGap = 0; - - // special case: gap is >255 then patch will be 0. if gap is - // <=255 then patch cannot be 0 - while (currentGap == 255 && currentPatch == 0) { - actualGap += 255; - patchIndex++; - currentGap = unpackedPatch[patchIndex] >>> patchWidth; - currentPatch = unpackedPatch[patchIndex] & patchMask; - } - // add the left over gap - actualGap += currentGap; - - // next gap is relative to the current gap - actualGap += i; - } - } - else { - // no patching required. 
add base to unpacked value to get final value - literals[numLiterals++] = base + unpacked[i]; - } - } - - } - - // This comes from the Apache Hive ORC code - private void readDirectValues(int firstByte) - throws IOException - { - // extract the number of fixed bits - int fixedBits = LongDecode.decodeBitWidth((firstByte >>> 1) & 0x1F); // 0b1_1111 - - // extract the run length - int length = (firstByte & 0x01) << 8; - length |= input.read(); - // runs are one off - length += 1; - - // write the unpacked values and zigzag decode to result buffer - readBitPackedLongs(literals, numLiterals, length, fixedBits, input); - if (signed) { - for (int i = 0; i < length; i++) { - literals[numLiterals] = LongDecode.zigzagDecode(literals[numLiterals]); - numLiterals++; - } - } - else { - numLiterals += length; - } - } - - // This comes from the Apache Hive ORC code - private void readShortRepeatValues(int firstByte) - throws IOException - { - // read the number of bytes occupied by the value - int size = (firstByte >>> 3) & 0x07; // 0b0111 - // #bytes are one off - size += 1; - - // read the run length - int length = firstByte & 0x07; - // run lengths values are stored only after MIN_REPEAT value is met - length += MIN_REPEAT_SIZE; - - // read the repeated value which is store using fixed bytes - long val = bytesToLongBE(input, size); - - if (signed) { - val = LongDecode.zigzagDecode(val); - } - - // repeat the value for length times - for (int i = 0; i < length; i++) { - literals[numLiterals++] = val; - } - } - - // This comes from the Apache Hive ORC code - private static void readBitPackedLongs(long[] buffer, int offset, int len, int bitSize, InputStream input) - throws IOException - { - int bitsLeft = 0; - int current = 0; - - for (int i = offset; i < (offset + len); i++) { - long result = 0; - int bitsLeftToRead = bitSize; - while (bitsLeftToRead > bitsLeft) { - result <<= bitsLeft; - result |= current & ((1 << bitsLeft) - 1); - bitsLeftToRead -= bitsLeft; - current = 
input.read(); - bitsLeft = 8; - } - - // handle the left over bits - if (bitsLeftToRead > 0) { - result <<= bitsLeftToRead; - bitsLeft -= bitsLeftToRead; - result |= (current >> bitsLeft) & ((1 << bitsLeftToRead) - 1); - } - buffer[i] = result; - } - } - - /** - * Read n bytes in big endian order and convert to long. - */ - private static long bytesToLongBE(InputStream input, int n) - throws IOException - { - long out = 0; - long val; - while (n > 0) { - n--; - // store it in a long and then shift else integer overflow will occur - val = input.read(); - out |= (val << (n * 8)); - } - return out; - } - - @Override - public long next() - throws IOException - { - if (used == numLiterals) { - numLiterals = 0; - used = 0; - readValues(); - } - return literals[used++]; - } - - @Override - public Class getCheckpointType() - { - return LongStreamV2Checkpoint.class; - } - - @Override - public void seekToCheckpoint(LongStreamCheckpoint checkpoint) - throws IOException - { - LongStreamV2Checkpoint v2Checkpoint = OrcStreamUtils.checkType(checkpoint, LongStreamV2Checkpoint.class, "Checkpoint"); - - // if the checkpoint is within the current buffer, just adjust the pointer - if (lastReadInputCheckpoint == v2Checkpoint.getInputStreamCheckpoint() && v2Checkpoint.getOffset() <= numLiterals) { - used = v2Checkpoint.getOffset(); - } - else { - // otherwise, discard the buffer and start over - input.seekToCheckpoint(v2Checkpoint.getInputStreamCheckpoint()); - numLiterals = 0; - used = 0; - skip(v2Checkpoint.getOffset()); - } - } - - @Override - public void skip(int items) - throws IOException - { - while (items > 0) { - if (used == numLiterals) { - numLiterals = 0; - used = 0; - readValues(); - } - long consume = Math.min(items, numLiterals - used); - used += consume; - items -= consume; - } - } - - @Override - public long sum(int items) - throws IOException - { - long sum = 0; - for (int i = 0; i < items; i++) { - sum += next(); - } - return sum; - } - - @Override - public void 
nextLongVector(int items, long[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = next(); - } - } - - @Override - public void nextLongVector(int items, long[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = next(); - } - } - } - - @Override - public void nextIntVector(int items, int[] vector) - throws IOException - { - for (int i = 0; i < items; i++) { - vector[i] = Ints.checkedCast(next()); - } - } - - @Override - public void nextIntVector(int items, int[] vector, boolean[] isNull) - throws IOException - { - for (int i = 0; i < items; i++) { - if (!isNull[i]) { - vector[i] = Ints.checkedCast(next()); - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java deleted file mode 100644 index 54472236d8..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcInputStream.java +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.base.MoreObjects; -import org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException; -import org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind; -import io.airlift.slice.BasicSliceInput; -import io.airlift.slice.Slice; -import io.airlift.slice.Slices; -import org.iq80.snappy.Snappy; - -import java.io.IOException; -import java.io.InputStream; -import java.util.zip.DataFormatException; -import java.util.zip.Inflater; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; -import static org.apache.tajo.storage.thirdparty.orc.checkpoint.InputStreamCheckpoint.*; -import static org.apache.tajo.storage.thirdparty.orc.metadata.CompressionKind.*; -import static io.airlift.slice.Slices.EMPTY_SLICE; -import static sun.misc.Unsafe.ARRAY_BYTE_BASE_OFFSET; - -public final class OrcInputStream - extends InputStream -{ - public static final int BLOCK_HEADER_SIZE = 3; - - private final String source; - private final BasicSliceInput compressedSliceInput; - private final CompressionKind compressionKind; - private final int bufferSize; - - private int currentCompressedBlockOffset; - private BasicSliceInput current; - - private Slice buffer; - - public OrcInputStream(String source, BasicSliceInput sliceInput, CompressionKind compressionKind, int bufferSize) - { - this.source = checkNotNull(source, "source is null"); - - checkNotNull(sliceInput, "sliceInput is null"); - - this.compressionKind = checkNotNull(compressionKind, "compressionKind is null"); - this.bufferSize = bufferSize; - - if (compressionKind == UNCOMPRESSED) { - this.current = sliceInput; - this.compressedSliceInput = EMPTY_SLICE.getInput(); - } - else { - checkArgument(compressionKind == SNAPPY || compressionKind == ZLIB, "%s compression not 
supported", compressionKind); - this.compressedSliceInput = checkNotNull(sliceInput, "compressedSliceInput is null"); - this.current = EMPTY_SLICE.getInput(); - } - } - - @Override - public void close() - throws IOException - { - current = null; - } - - @Override - public int available() - throws IOException - { - if (current == null) { - return 0; - } - return current.available(); - } - - @Override - public boolean markSupported() - { - return false; - } - - @Override - public int read() - throws IOException - { - if (current == null) { - return -1; - } - - int result = current.read(); - if (result != -1) { - return result; - } - - advance(); - return read(); - } - - @Override - public int read(byte[] b, int off, int length) - throws IOException - { - if (current == null) { - return -1; - } - - if (!current.isReadable()) { - advance(); - if (current == null) { - return -1; - } - } - - return current.read(b, off, length); - } - - public long getCheckpoint() - { - // if the decompressed buffer is empty, return a checkpoint starting at the next block - if (current == null || (current.position() == 0 && current.available() == 0)) { - return createInputStreamCheckpoint(compressedSliceInput.position(), 0); - } - // otherwise return a checkpoint at the last compressed block read and the current position in the buffer - return createInputStreamCheckpoint(currentCompressedBlockOffset, current.position()); - } - - public boolean seekToCheckpoint(long checkpoint) - throws IOException - { - int compressedBlockOffset = decodeCompressedBlockOffset(checkpoint); - int decompressedOffset = decodeDecompressedOffset(checkpoint); - boolean discardedBuffer; - if (compressedBlockOffset != currentCompressedBlockOffset) { - verifyFormat(compressionKind != UNCOMPRESSED, "Reset stream has a compressed block offset but stream is not compressed"); - compressedSliceInput.setPosition(compressedBlockOffset); - current = EMPTY_SLICE.getInput(); - discardedBuffer = true; - } - else { - 
discardedBuffer = false; - } - - if (decompressedOffset != current.position()) { - current.setPosition(0); - if (current.available() < decompressedOffset) { - decompressedOffset -= current.available(); - advance(); - } - current.setPosition(decompressedOffset); - } - return discardedBuffer; - } - - @Override - public long skip(long n) - throws IOException - { - if (current == null || n <= 0) { - return -1; - } - - long result = current.skip(n); - if (result != 0) { - return result; - } - if (read() == -1) { - return 0; - } - return 1 + current.skip(n - 1); - } - - // This comes from the Apache Hive ORC code - private void advance() - throws IOException - { - if (compressedSliceInput == null || compressedSliceInput.available() == 0) { - current = null; - return; - } - - // 3 byte header - // NOTE: this must match BLOCK_HEADER_SIZE - currentCompressedBlockOffset = compressedSliceInput.position(); - int b0 = compressedSliceInput.readUnsignedByte(); - int b1 = compressedSliceInput.readUnsignedByte(); - int b2 = compressedSliceInput.readUnsignedByte(); - - boolean isUncompressed = (b0 & 0x01) == 1; - int chunkLength = (b2 << 15) | (b1 << 7) | (b0 >>> 1); - - Slice chunk = compressedSliceInput.readSlice(chunkLength); - - if (isUncompressed) { - current = chunk.getInput(); - } - else { - if (buffer == null) { - buffer = Slices.allocate(bufferSize); - } - - int uncompressedSize; - if (compressionKind == ZLIB) { - uncompressedSize = decompressZip(chunk, buffer); - } - else { - uncompressedSize = decompressSnappy(chunk, buffer); - } - - current = buffer.slice(0, uncompressedSize).getInput(); - } - } - - @Override - public String toString() - { - return MoreObjects.toStringHelper(this) - .add("source", source) - .add("compressedOffset", compressedSliceInput.position()) - .add("uncompressedOffset", current == null ? 
null : current.position()) - .add("compression", compressionKind) - .toString(); - } - - // This comes from the Apache Hive ORC code - private static int decompressZip(Slice in, Slice buffer) - throws IOException - { - byte[] outArray = (byte[]) buffer.getBase(); - int outOffset = 0; - - byte[] inArray = (byte[]) in.getBase(); - int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET); - int inLength = in.length(); - - Inflater inflater = new Inflater(true); - inflater.setInput(inArray, inOffset, inLength); - while (!(inflater.finished() || inflater.needsDictionary() || inflater.needsInput())) { - try { - int count = inflater.inflate(outArray, outOffset, outArray.length - outOffset); - outOffset += count; - } - catch (DataFormatException e) { - throw new OrcCorruptionException(e, "Invalid compressed stream"); - } - } - inflater.end(); - return outOffset; - } - - private static int decompressSnappy(Slice in, Slice buffer) - throws IOException - { - byte[] outArray = (byte[]) buffer.getBase(); - - byte[] inArray = (byte[]) in.getBase(); - int inOffset = (int) (in.getAddress() - ARRAY_BYTE_BASE_OFFSET); - int inLength = in.length(); - - return Snappy.uncompress(inArray, inOffset, inLength, outArray, 0); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java deleted file mode 100644 index 2f04155d6c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/OrcStreamUtils.java +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import java.io.IOException; -import java.io.InputStream; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.OrcCorruptionException.verifyFormat; - -final class OrcStreamUtils -{ - public static final int MIN_REPEAT_SIZE = 3; - - private OrcStreamUtils() - { - } - - public static void skipFully(InputStream input, long length) - throws IOException - { - while (length > 0) { - long result = input.skip(length); - verifyFormat(result >= 0, "Unexpected end of stream"); - length -= result; - } - } - - public static void readFully(InputStream input, byte[] buffer, int offset, int length) - throws IOException - { - while (offset < length) { - int result = input.read(buffer, offset, length - offset); - verifyFormat(result >= 0, "Unexpected end of stream"); - offset += result; - } - } - - static B checkType(A value, Class target, String name) - { - checkNotNull(value, "%s is null", name); - checkArgument(target.isInstance(value), - "%s must be of type %s, not %s", - name, - target.getName(), - value.getClass().getName()); - return target.cast(value); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java deleted file mode 100644 index e03dbbbae1..0000000000 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/stream/StreamSources.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc.stream; - -import com.google.common.collect.ImmutableMap; -import org.apache.tajo.storage.thirdparty.orc.StreamDescriptor; -import org.apache.tajo.storage.thirdparty.orc.StreamId; -import org.apache.tajo.storage.thirdparty.orc.metadata.Stream.StreamKind; - -import javax.annotation.Nonnull; -import java.util.Map; - -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; -import static org.apache.tajo.storage.thirdparty.orc.stream.MissingStreamSource.missingStreamSource; - -public class StreamSources -{ - private final Map> streamSources; - - public StreamSources(Map> streamSources) - { - this.streamSources = ImmutableMap.copyOf(checkNotNull(streamSources, "streamSources is null")); - } - - @Nonnull - public > StreamSource getStreamSource(StreamDescriptor streamDescriptor, StreamKind streamKind, Class streamType) - { - checkNotNull(streamDescriptor, "streamDescriptor is null"); - checkNotNull(streamType, "streamType is null"); - - StreamSource streamSource = streamSources.get(new StreamId(streamDescriptor.getStreamId(), streamKind)); - if (streamSource == null) { - streamSource = missingStreamSource(streamType); - } - - 
checkArgument(streamType.isAssignableFrom(streamSource.getStreamType()), - "%s must be of type %s, not %s", - streamDescriptor, - streamType.getName(), - streamSource.getStreamType().getName()); - - return (StreamSource) streamSource; - } -} From e279d21567554b1e9c11d19bbebfca65c0b58007 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 26 May 2015 14:01:52 +0900 Subject: [PATCH 079/141] Remove useless maven dependancies --- tajo-storage/tajo-storage-hdfs/pom.xml | 63 -------------------------- 1 file changed, 63 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index 7a63f67dc1..1dbfc0622b 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -352,74 +352,11 @@ net.minidev json-smart - - io.airlift - slice - 0.7 - - - io.airlift - units - 0.97 - - - com.google.guava - guava - 18.0 - - - joda-time - joda-time - 2.4 - - - org.iq80.snappy - snappy - 0.2 - - - com.facebook.presto.hive - hive-apache - 0.9 - - - org.jetbrains - annotations - 13.0 - - - com.fasterxml.jackson.core - jackson-core - 2.4.2 - com.facebook.presto presto-orc 0.86 - - com.facebook.hive - hive-dwrf - 0.8 - - - commons-logging - commons-logging - - - org.iq80.snappy - snappy - - - com.facebook.presto.hadoop - hadoop-cdh4 - - - it.unimi.dsi - fastutil - - - From 7ff190f83936177c4911914a4c5944ef1baf3d20 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 5 Jun 2015 16:08:19 +0900 Subject: [PATCH 080/141] [Deprecated] Use store type instead of type string --- .../test/java/org/apache/tajo/storage/orc/TestOrcScanner.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java index 8b60b9c2c7..3a61268bfa 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java @@ -24,6 +24,7 @@ import org.apache.hadoop.fs.Path; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.catalog.proto.CatalogProtos; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.conf.TajoConf; import org.apache.tajo.datum.TimestampDatum; @@ -66,7 +67,7 @@ public void setup() throws IOException { Configuration conf = new TajoConf(); - TableMeta meta = new TableMeta("ORC", new KeyValueSet()); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORCFILE, new KeyValueSet()); Fragment fragment = getFileFragment(conf, "u_data_20.orc"); From 4eb1252e00578ae0ecbdf7437c9d486fc33f79b7 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 5 Jun 2015 23:37:50 +0900 Subject: [PATCH 081/141] Make it projectable --- .../apache/tajo/storage/orc/OrcScanner.java | 44 ++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index d72c968fc1..a1958c536c 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -83,6 +83,16 @@ private Vector createOrcVector(TajoDataTypes.Type type) { private FileSystem fs; private FSDataInputStream fis; + private static class ColumnInfo { + TajoDataTypes.Type type; + int id; + } + + /** + * Temporary array for caching column info + */ + private ColumnInfo [] targetColInfo; + @Override public void init() throws IOException { OrcReader orcReader; @@ -110,16 +120,23 @@ public void init() throws IOException { fs.getFileStatus(path).getLen(), 
100000000); + targetColInfo = new ColumnInfo[targets.length]; + for (int i=0; i columnSet = new HashSet(); - for (int i=0; i Date: Fri, 5 Jun 2015 23:38:54 +0900 Subject: [PATCH 082/141] [Deprecated] modify storage-default.xml for old version --- .../src/main/resources/storage-default.xml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index 6b5143d6b0..147879db36 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -39,7 +39,7 @@ tajo.storage.scanner-handler - text,csv,json,raw,rcfile,row,parquet,sequencefile,avro,hbase + text,csv,json,raw,rcfile,row,parquet,orcfile,sequencefile,avro,hbase @@ -71,6 +71,10 @@ tajo.storage.fragment.parquet.class org.apache.tajo.storage.fragment.FileFragment + + tajo.storage.fragment.orcfile.class + org.apache.tajo.storage.fragment.FileFragment + tajo.storage.fragment.sequencefile.class org.apache.tajo.storage.fragment.FileFragment @@ -121,7 +125,7 @@ - tajo.storage.scanner-handler.orc.class + tajo.storage.scanner-handler.orcfile.class org.apache.tajo.storage.orc.OrcScanner From 5f617162f19e4842d4fe7c485d796f93941edc04 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 17 Jun 2015 16:36:45 +0900 Subject: [PATCH 083/141] Split length should be file size unless it is not splittable --- .../java/org/apache/tajo/storage/orc/OrcScanner.java | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index a1958c536c..3541394256 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -33,7 +33,6 @@ import org.apache.tajo.storage.VTuple; import org.apache.tajo.storage.fragment.Fragment; import com.facebook.presto.orc.*; -import com.facebook.presto.orc.metadata.ColumnStatistics; import com.facebook.presto.orc.metadata.OrcMetadataReader; import org.apache.tajo.storage.thirdparty.orc.HdfsOrcDataSource; import org.apache.tajo.util.datetime.DateTimeUtil; @@ -41,7 +40,6 @@ import java.io.IOException; import java.util.HashSet; -import java.util.Map; import java.util.Set; /** @@ -144,13 +142,8 @@ public void init() throws IOException { // TODO: make OrcPredicate useful // TODO: TimeZone should be from conf // TODO: it might be splittable - recordReader = orcReader.createRecordReader(columnSet, new OrcPredicate() { - @Override - public boolean matches(long numberOfRows, Map statisticsByColumnIndex) { - return true; - } - }, - 0, 1024, DateTimeZone.getDefault()); + recordReader = orcReader.createRecordReader(columnSet, OrcPredicate.TRUE, + 0, orcDataSource.getSize(), DateTimeZone.getDefault()); getNextBatch(); } From 512566b727ab7c32a1e10fb4c4c3b84c764d8221 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 17 Jun 2015 17:05:44 +0900 Subject: [PATCH 084/141] make splittable --- .../java/org/apache/tajo/storage/orc/OrcScanner.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 3541394256..987994c595 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -18,6 +18,8 @@ package org.apache.tajo.storage.orc; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import 
org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -46,6 +48,7 @@ * OrcScanner for reading ORC files */ public class OrcScanner extends FileScanner { + private static final Log LOG = LogFactory.getLog(OrcScanner.class); private OrcRecordReader recordReader; private Vector [] vectors; private int currentPosInBatch = 0; @@ -143,7 +146,11 @@ public void init() throws IOException { // TODO: TimeZone should be from conf // TODO: it might be splittable recordReader = orcReader.createRecordReader(columnSet, OrcPredicate.TRUE, - 0, orcDataSource.getSize(), DateTimeZone.getDefault()); + fragment.getStartKey(), fragment.getLength(), DateTimeZone.getDefault()); + + LOG.debug("file fragment { path: " + fragment.getPath() + + ", start offset: " + fragment.getStartKey() + + ", length: " + fragment.getLength() + "}"); getNextBatch(); } @@ -253,6 +260,6 @@ public boolean isSelectable() { @Override public boolean isSplittable() { - return false; + return true; } } From 9a2d164c07a6c0d601ce87038d124f00d065f933 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 17 Jun 2015 17:53:47 +0900 Subject: [PATCH 085/141] 'orcfile' to 'orc' --- .../src/main/resources/storage-default.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index 147879db36..ea427786c9 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -39,7 +39,7 @@ tajo.storage.scanner-handler - text,csv,json,raw,rcfile,row,parquet,orcfile,sequencefile,avro,hbase + text,csv,json,raw,rcfile,row,parquet,orc,sequencefile,avro,hbase @@ -72,7 +72,7 @@ org.apache.tajo.storage.fragment.FileFragment - tajo.storage.fragment.orcfile.class + tajo.storage.fragment.orc.class 
org.apache.tajo.storage.fragment.FileFragment @@ -125,7 +125,7 @@ - tajo.storage.scanner-handler.orcfile.class + tajo.storage.scanner-handler.orc.class org.apache.tajo.storage.orc.OrcScanner From f28c081a1bfba695b22cd8122c286562daabd12d Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 17 Jun 2015 18:18:42 +0900 Subject: [PATCH 086/141] 'orcfile' to 'orc' #2 --- .../src/main/java/org/apache/tajo/catalog/CatalogUtil.java | 4 ++-- .../tajo-catalog-common/src/main/proto/CatalogProtos.proto | 2 +- .../test/java/org/apache/tajo/storage/orc/TestOrcScanner.java | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java index d2b8e67d10..573faf54b3 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java +++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java @@ -279,8 +279,8 @@ public static StoreType getStoreType(final String typeStr) { return StoreType.ROWFILE; } else if (typeStr.equalsIgnoreCase(StoreType.RCFILE.name())) { return StoreType.RCFILE; - } else if (typeStr.equalsIgnoreCase(StoreType.ORCFILE.name())) { - return StoreType.ORCFILE; + } else if (typeStr.equalsIgnoreCase(StoreType.ORC.name())) { + return StoreType.ORC; } else if (typeStr.equalsIgnoreCase(StoreType.PARQUET.name())) { return StoreType.PARQUET; } else if (typeStr.equalsIgnoreCase(StoreType.SEQUENCEFILE.name())) { diff --git a/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto b/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto index b2b690e4a7..5b49c92a0e 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto +++ b/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto @@ -32,7 +32,7 @@ enum StoreType { RCFILE = 3; ROWFILE = 4; HCFILE = 5; - ORCFILE = 6; 
+ ORC = 6; PARQUET = 7; SEQUENCEFILE = 8; AVRO = 9; diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java index 3a61268bfa..19a6f5c521 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java @@ -67,7 +67,7 @@ public void setup() throws IOException { Configuration conf = new TajoConf(); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORCFILE, new KeyValueSet()); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORC, new KeyValueSet()); Fragment fragment = getFileFragment(conf, "u_data_20.orc"); From 7275f8a534fd0be7013c26810ff542cb28f844ec Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 8 Jul 2015 11:03:30 +0900 Subject: [PATCH 087/141] max-merge-distance option added --- .../main/java/org/apache/tajo/storage/StorageConstants.java | 2 ++ .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java index d2c6c1c254..3f87bf82f0 100644 --- a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java +++ b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java @@ -72,6 +72,8 @@ public class StorageConstants { public static final String DEFAULT_BINARY_SERDE = "org.apache.tajo.storage.BinarySerializerDeserializer"; public static final String DEFAULT_TEXT_SERDE = "org.apache.tajo.storage.TextSerializerDeserializer"; + public static final String ORC_MAX_MERGE_DISTANCE = "orc.max.merge.distance"; + public static final String DEFAULT_ORC_MAX_MERGE_DISTANCE = "1048576"; // 1MB // Parquet file properties 
------------------------------------------------- public static final String PARQUET_DEFAULT_BLOCK_SIZE; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 987994c595..5c7a6eb98c 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -31,6 +31,7 @@ import org.apache.tajo.datum.*; import org.apache.tajo.exception.UnsupportedException; import org.apache.tajo.storage.FileScanner; +import org.apache.tajo.storage.StorageConstants; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.VTuple; import org.apache.tajo.storage.fragment.Fragment; @@ -119,7 +120,8 @@ public void init() throws IOException { this.fragment.getPath().toString(), fis, fs.getFileStatus(path).getLen(), - 100000000); + Integer.parseInt(meta.getOption(StorageConstants.ORC_MAX_MERGE_DISTANCE, + StorageConstants.DEFAULT_ORC_MAX_MERGE_DISTANCE))); targetColInfo = new ColumnInfo[targets.length]; for (int i=0; i Date: Wed, 8 Jul 2015 11:05:22 +0900 Subject: [PATCH 088/141] TODO line removed(it's done) --- .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 1 - 1 file changed, 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 5c7a6eb98c..749e15bca9 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -115,7 +115,6 @@ public void init() throws IOException { fis = fs.open(path); } - // TODO: max merge distance should be fetched from conf OrcDataSource orcDataSource = new HdfsOrcDataSource( 
this.fragment.getPath().toString(), fis, From 9c98fa5a46c158a1238f8c74ff1ada5d8ee95f79 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 17:41:00 +0900 Subject: [PATCH 089/141] Date type added --- .../main/java/org/apache/tajo/util/datetime/DateTimeUtil.java | 2 ++ .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java b/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java index f1c7970d6a..3c79cf7ff2 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java @@ -40,6 +40,8 @@ public class DateTimeUtil { /** maximum possible number of fields in a date * string */ private static int MAXDATEFIELDS = 25; + public final static int DAYS_FROM_JULIAN_TO_EPOCH = 2440588; + public static boolean isJulianCalendar(int year, int month, int day) { return year <= 1752 && month <= 9 && day < 14; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 749e15bca9..3e214f72db 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -64,6 +64,7 @@ private Vector createOrcVector(TajoDataTypes.Type type) { case INT1: case INT2: case INT4: case INT8: case UINT1: case UINT2: case UINT4: case UINT8: case TIMESTAMP: + case DATE: return new LongVector(); case FLOAT4: @@ -213,6 +214,9 @@ private Datum createValueDatum(Vector vector, TajoDataTypes.Type type) { case TIMESTAMP: return new TimestampDatum(DateTimeUtil.javaTimeToJulianTime(((LongVector) vector).vector[currentPosInBatch])); + case DATE: + return new 
DateDatum((int)((LongVector)vector).vector[currentPosInBatch] + DateTimeUtil.DAYS_FROM_JULIAN_TO_EPOCH); + default: throw new UnsupportedException("This data type is not supported currently: "+type.toString()); } From 578718a67a094f4be00e2930f5dc8602138cbdf0 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 5 Jun 2015 23:37:50 +0900 Subject: [PATCH 090/141] Make it projectable --- .../apache/tajo/storage/orc/OrcScanner.java | 44 ++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 8e3c474dc9..6292ffb430 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -85,6 +85,16 @@ private Vector createOrcVector(TajoDataTypes.Type type) { private FileSystem fs; private FSDataInputStream fis; + private static class ColumnInfo { + TajoDataTypes.Type type; + int id; + } + + /** + * Temporary array for caching column info + */ + private ColumnInfo [] targetColInfo; + @Override public void init() throws IOException { OrcReader orcReader; @@ -113,16 +123,23 @@ public void init() throws IOException { Integer.parseInt(meta.getOption(StorageConstants.ORC_MAX_MERGE_DISTANCE, StorageConstants.DEFAULT_ORC_MAX_MERGE_DISTANCE))); + targetColInfo = new ColumnInfo[targets.length]; + for (int i=0; i columnSet = new HashSet(); - for (int i=0; i Date: Wed, 17 Jun 2015 16:36:45 +0900 Subject: [PATCH 091/141] Split length should be file size unless it is not splittable --- .../java/org/apache/tajo/storage/orc/OrcScanner.java | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 6292ffb430..e6fdf3a0ea 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -35,7 +35,6 @@ import org.apache.tajo.storage.VTuple; import org.apache.tajo.storage.fragment.Fragment; import com.facebook.presto.orc.*; -import com.facebook.presto.orc.metadata.ColumnStatistics; import com.facebook.presto.orc.metadata.OrcMetadataReader; import org.apache.tajo.storage.thirdparty.orc.HdfsOrcDataSource; import org.apache.tajo.util.datetime.DateTimeUtil; @@ -43,7 +42,6 @@ import java.io.IOException; import java.util.HashSet; -import java.util.Map; import java.util.Set; /** @@ -147,13 +145,8 @@ public void init() throws IOException { // TODO: make OrcPredicate useful // TODO: TimeZone should be from conf // TODO: it might be splittable - recordReader = orcReader.createRecordReader(columnSet, new OrcPredicate() { - @Override - public boolean matches(long numberOfRows, Map statisticsByColumnIndex) { - return true; - } - }, - 0, 1024, DateTimeZone.getDefault()); + recordReader = orcReader.createRecordReader(columnSet, OrcPredicate.TRUE, + 0, orcDataSource.getSize(), DateTimeZone.getDefault()); getNextBatch(); } From 462fafe0332efe067a46a3d99a704fad77a39b5c Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 17 Jun 2015 17:05:44 +0900 Subject: [PATCH 092/141] make splittable --- .../java/org/apache/tajo/storage/orc/OrcScanner.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index e6fdf3a0ea..77d2de94d5 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -18,6 +18,8 @@ package org.apache.tajo.storage.orc; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; @@ -48,6 +50,7 @@ * OrcScanner for reading ORC files */ public class OrcScanner extends FileScanner { + private static final Log LOG = LogFactory.getLog(OrcScanner.class); private OrcRecordReader recordReader; private Vector [] vectors; private int currentPosInBatch = 0; @@ -146,7 +149,11 @@ public void init() throws IOException { // TODO: TimeZone should be from conf // TODO: it might be splittable recordReader = orcReader.createRecordReader(columnSet, OrcPredicate.TRUE, - 0, orcDataSource.getSize(), DateTimeZone.getDefault()); + fragment.getStartKey(), fragment.getLength(), DateTimeZone.getDefault()); + + LOG.debug("file fragment { path: " + fragment.getPath() + + ", start offset: " + fragment.getStartKey() + + ", length: " + fragment.getLength() + "}"); getNextBatch(); } @@ -261,6 +268,6 @@ public void setFilter(EvalNode filter) { @Override public boolean isSplittable() { - return false; + return true; } } From b94daf6d682c086c0c802e38cc72e28c100c24be Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 8 Jul 2015 11:05:22 +0900 Subject: [PATCH 093/141] TODO line removed(it's done) --- .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 1 - 1 file changed, 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 77d2de94d5..520e938cb0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ 
-116,7 +116,6 @@ public void init() throws IOException { fis = fs.open(path); } - // TODO: max merge distance should be fetched from conf OrcDataSource orcDataSource = new HdfsOrcDataSource( this.fragment.getPath().toString(), fis, From e133638ad65a59e287e8f869be0b1f42a76e183a Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 17:41:00 +0900 Subject: [PATCH 094/141] Date type added --- .../main/java/org/apache/tajo/util/datetime/DateTimeUtil.java | 2 ++ .../src/main/java/org/apache/tajo/storage/orc/OrcScanner.java | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java b/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java index 570873d1ab..5a338d39a5 100644 --- a/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java +++ b/tajo-common/src/main/java/org/apache/tajo/util/datetime/DateTimeUtil.java @@ -40,6 +40,8 @@ public class DateTimeUtil { /** maximum possible number of fields in a date * string */ private static int MAXDATEFIELDS = 25; + public final static int DAYS_FROM_JULIAN_TO_EPOCH = 2440588; + public static boolean isJulianCalendar(int year, int month, int day) { return year <= 1752 && month <= 9 && day < 14; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 520e938cb0..82cffa6cef 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -65,6 +65,7 @@ private Vector createOrcVector(TajoDataTypes.Type type) { case INT1: case INT2: case INT4: case INT8: case UINT1: case UINT2: case UINT4: case UINT8: case TIMESTAMP: + case DATE: return new LongVector(); case FLOAT4: @@ -214,6 +215,9 @@ private Datum createValueDatum(Vector vector, 
TajoDataTypes.Type type) { case TIMESTAMP: return new TimestampDatum(DateTimeUtil.javaTimeToJulianTime(((LongVector) vector).vector[currentPosInBatch])); + case DATE: + return new DateDatum((int)((LongVector)vector).vector[currentPosInBatch] + DateTimeUtil.DAYS_FROM_JULIAN_TO_EPOCH); + default: throw new UnsupportedException("This data type is not supported currently: "+type.toString()); } From b37204aa0475e1a6f5ad8f66114a55c1ad79d88f Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Mon, 13 Jul 2015 12:13:02 +0900 Subject: [PATCH 095/141] add orc to storage-default.xml --- .../src/main/resources/storage-default.xml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index 8bdd36ff10..45b3af6ed2 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -39,7 +39,7 @@ tajo.storage.scanner-handler - text,csv,json,raw,rcfile,row,parquet,sequencefile,avro,hbase + text,csv,json,raw,rcfile,row,parquet,orc,sequencefile,avro,hbase @@ -71,6 +71,10 @@ tajo.storage.fragment.parquet.class org.apache.tajo.storage.fragment.FileFragment + + tajo.storage.fragment.orc.class + org.apache.tajo.storage.fragment.FileFragment + tajo.storage.fragment.sequencefile.class org.apache.tajo.storage.fragment.FileFragment From 08b88c171c20849cb716f66bcf19d5fbe15375f2 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Mon, 13 Jul 2015 12:20:54 +0900 Subject: [PATCH 096/141] Class name changed to 'ORCScanner' --- .../src/main/resources/storage-default.xml | 2 +- .../main/java/org/apache/tajo/storage/orc/OrcScanner.java | 6 +++--- .../java/org/apache/tajo/storage/orc/TestOrcScanner.java | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index 45b3af6ed2..eb72110ee2 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -126,7 +126,7 @@ tajo.storage.scanner-handler.orc.class - org.apache.tajo.storage.orc.OrcScanner + org.apache.tajo.storage.orc.ORCScanner diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java index 82cffa6cef..54406f9eab 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java @@ -49,14 +49,14 @@ /** * OrcScanner for reading ORC files */ -public class OrcScanner extends FileScanner { - private static final Log LOG = LogFactory.getLog(OrcScanner.class); +public class ORCScanner extends FileScanner { + private static final Log LOG = LogFactory.getLog(ORCScanner.class); private OrcRecordReader recordReader; private Vector [] vectors; private int currentPosInBatch = 0; private int batchSize = 0; - public OrcScanner(Configuration conf, final Schema schema, final TableMeta meta, final Fragment fragment) { + public ORCScanner(Configuration conf, final Schema schema, final TableMeta meta, final Fragment fragment) { super(conf, schema, meta, fragment); } diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java index 8264003a95..b4117931fe 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java @@ -40,8 +40,8 @@ import java.io.IOException; import java.net.URL; -public class TestOrcScanner { - private OrcScanner orcScanner; +public class TestORCScanner { + private ORCScanner orcScanner; public static Path getResourcePath(String path, String suffix) { URL resultBaseURL = ClassLoader.getSystemResource(path); @@ -70,7 +70,7 @@ public void setup() throws IOException { Fragment fragment = getFileFragment(conf, "u_data_20.orc"); - orcScanner = new OrcScanner(conf, schema, meta, fragment); + orcScanner = new ORCScanner(conf, schema, meta, fragment); orcScanner.init(); } From b5b5871cb29dad875b05bebcf0e54646e0957c0a Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Mon, 13 Jul 2015 12:25:28 +0900 Subject: [PATCH 097/141] file names changed --- .../apache/tajo/storage/orc/{OrcScanner.java => ORCScanner.java} | 0 .../tajo/storage/orc/{TestOrcScanner.java => TestORCScanner.java} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/{OrcScanner.java => ORCScanner.java} (100%) rename tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/{TestOrcScanner.java => TestORCScanner.java} (100%) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java similarity index 100% rename from tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcScanner.java rename to tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java similarity index 100% rename from 
tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrcScanner.java rename to tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java From 8c915e7e8e388dcb49620da6e32823d569e7c2e1 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 16 Jul 2015 14:54:34 +0900 Subject: [PATCH 098/141] null check --- .../org/apache/tajo/catalog/CatalogUtil.java | 4 +- .../apache/tajo/storage/orc/ORCScanner.java | 88 +++++++++++++++---- 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java index c896aa8999..aa700192ee 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java +++ b/tajo-catalog/tajo-catalog-common/src/main/java/org/apache/tajo/catalog/CatalogUtil.java @@ -295,8 +295,8 @@ public static StoreType getStoreType(final String typeStr) { return StoreType.ROWFILE; } else if (typeStr.equalsIgnoreCase(StoreType.RCFILE.name())) { return StoreType.RCFILE; - } else if (typeStr.equalsIgnoreCase(StoreType.ORCFILE.name())) { - return StoreType.ORCFILE; + } else if (typeStr.equalsIgnoreCase(StoreType.ORC.name())) { + return StoreType.ORC; } else if (typeStr.equalsIgnoreCase(StoreType.PARQUET.name())) { return StoreType.PARQUET; } else if (typeStr.equalsIgnoreCase(StoreType.SEQUENCEFILE.name())) { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java index 54406f9eab..ab74cdfc4a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java @@ -18,6 +18,7 @@ package org.apache.tajo.storage.orc; +import 
com.google.protobuf.InvalidProtocolBufferException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -60,10 +61,11 @@ public ORCScanner(Configuration conf, final Schema schema, final TableMeta meta, super(conf, schema, meta, fragment); } - private Vector createOrcVector(TajoDataTypes.Type type) { - switch (type) { + private Vector createOrcVector(TajoDataTypes.DataType type) { + switch (type.getType()) { case INT1: case INT2: case INT4: case INT8: case UINT1: case UINT2: case UINT4: case UINT8: + case INET4: case TIMESTAMP: case DATE: return new LongVector(); @@ -73,10 +75,13 @@ private Vector createOrcVector(TajoDataTypes.Type type) { return new DoubleVector(); case BOOLEAN: + case NULL_TYPE: return new BooleanVector(); case BLOB: case TEXT: + case CHAR: + case PROTOBUF: return new SliceVector(); default: @@ -88,7 +93,7 @@ private Vector createOrcVector(TajoDataTypes.Type type) { private FSDataInputStream fis; private static class ColumnInfo { - TajoDataTypes.Type type; + TajoDataTypes.DataType type; int id; } @@ -127,7 +132,7 @@ public void init() throws IOException { targetColInfo = new ColumnInfo[targets.length]; for (int i=0; i Date: Fri, 3 Jul 2015 15:43:25 +0900 Subject: [PATCH 099/141] Initial importing --- tajo-storage/tajo-storage-hdfs/pom.xml | 1 + .../orc/BinaryColumnStatistics.java | 25 + .../thirdparty/orc/BitFieldWriter.java | 69 + .../storage/thirdparty/orc/BloomFilterIO.java | 42 + .../orc/BooleanColumnStatistics.java | 27 + .../thirdparty/orc/ColumnStatistics.java | 36 + .../thirdparty/orc/ColumnStatisticsImpl.java | 1022 +++++++ .../thirdparty/orc/CompressionCodec.java | 68 + .../thirdparty/orc/CompressionKind.java | 27 + .../thirdparty/orc/DateColumnStatistics.java | 37 + .../orc/DecimalColumnStatistics.java | 45 + .../orc/DirectDecompressionCodec.java | 26 + .../orc/DoubleColumnStatistics.java | 44 + .../thirdparty/orc/DynamicByteArray.java | 303 ++ 
.../thirdparty/orc/DynamicIntArray.java | 142 + .../orc/IntegerColumnStatistics.java | 50 + .../storage/thirdparty/orc/IntegerWriter.java | 47 + .../storage/thirdparty/orc/MemoryManager.java | 188 ++ .../tajo/storage/thirdparty/orc/Metadata.java | 45 + .../tajo/storage/thirdparty/orc/OrcConf.java | 149 + .../tajo/storage/thirdparty/orc/OrcFile.java | 443 +++ .../thirdparty/orc/OrcFileKeyWrapper.java | 114 + .../thirdparty/orc/OrcFileValueWrapper.java | 92 + .../thirdparty/orc/OrcNewOutputFormat.java | 78 + .../thirdparty/orc/OrcOutputFormat.java | 189 ++ .../tajo/storage/thirdparty/orc/OrcSerde.java | 156 + .../storage/thirdparty/orc/OrcStruct.java | 607 ++++ .../tajo/storage/thirdparty/orc/OrcUnion.java | 160 ++ .../tajo/storage/thirdparty/orc/OrcUtils.java | 201 ++ .../storage/thirdparty/orc/OutStream.java | 286 ++ .../thirdparty/orc/PositionProvider.java | 26 + .../thirdparty/orc/PositionRecorder.java | 25 + .../orc/PositionedOutputStream.java | 38 + .../storage/thirdparty/orc/RedBlackTree.java | 309 ++ .../thirdparty/orc/RunLengthByteWriter.java | 106 + .../orc/RunLengthIntegerWriter.java | 143 + .../orc/RunLengthIntegerWriterV2.java | 832 ++++++ .../thirdparty/orc/SerializationUtils.java | 844 ++++++ .../storage/thirdparty/orc/SnappyCodec.java | 109 + .../storage/thirdparty/orc/StreamName.java | 95 + .../orc/StringColumnStatistics.java | 41 + .../thirdparty/orc/StringRedBlackTree.java | 202 ++ .../thirdparty/orc/StripeInformation.java | 59 + .../thirdparty/orc/StripeStatistics.java | 42 + .../orc/TimestampColumnStatistics.java | 38 + .../thirdparty/orc/VectorizedOrcSerde.java | 88 + .../tajo/storage/thirdparty/orc/Writer.java | 102 + .../storage/thirdparty/orc/WriterImpl.java | 2524 +++++++++++++++++ .../storage/thirdparty/orc/ZlibCodec.java | 169 ++ .../thirdparty/orc/util/BloomFilter.java | 291 ++ .../storage/thirdparty/orc/util/Murmur3.java | 334 +++ .../src/main/proto/orc_proto.proto | 217 ++ 52 files changed, 11353 insertions(+) create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BinaryColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BitFieldWriter.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BloomFilterIO.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionCodec.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionKind.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DateColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DecimalColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DirectDecompressionCodec.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicByteArray.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicIntArray.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerColumnStatistics.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/IntegerWriter.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Metadata.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcConf.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileKeyWrapper.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileValueWrapper.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcNewOutputFormat.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcOutputFormat.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcSerde.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcStruct.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUnion.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OutStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionProvider.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionRecorder.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionedOutputStream.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RedBlackTree.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthByteWriter.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriter.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriterV2.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/VectorizedOrcSerde.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java 
create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/util/BloomFilter.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/util/Murmur3.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto diff --git a/tajo-storage/tajo-storage-hdfs/pom.xml b/tajo-storage/tajo-storage-hdfs/pom.xml index bfa5707996..6d7d230629 100644 --- a/tajo-storage/tajo-storage-hdfs/pom.xml +++ b/tajo-storage/tajo-storage-hdfs/pom.xml @@ -135,6 +135,7 @@ --proto_path=../../tajo-catalog/tajo-catalog-common/src/main/proto --java_out=target/generated-sources/proto src/main/proto/StorageFragmentProtos.proto + src/main/proto/orc_proto.proto diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BinaryColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BinaryColumnStatistics.java new file mode 100644 index 0000000000..bee29fb994 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BinaryColumnStatistics.java @@ -0,0 +1,25 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +/** + * Statistics for binary columns. + */ +public interface BinaryColumnStatistics extends ColumnStatistics { + long getSum(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BitFieldWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BitFieldWriter.java new file mode 100644 index 0000000000..23719bd11e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BitFieldWriter.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.IOException; + +class BitFieldWriter { + private RunLengthByteWriter output; + private final int bitSize; + private byte current = 0; + private int bitsLeft = 8; + + BitFieldWriter(PositionedOutputStream output, + int bitSize) throws IOException { + this.output = new RunLengthByteWriter(output); + this.bitSize = bitSize; + } + + private void writeByte() throws IOException { + output.write(current); + current = 0; + bitsLeft = 8; + } + + void flush() throws IOException { + if (bitsLeft != 8) { + writeByte(); + } + output.flush(); + } + + void write(int value) throws IOException { + int bitsToWrite = bitSize; + while (bitsToWrite > bitsLeft) { + // add the bits to the bottom of the current word + current |= value >>> (bitsToWrite - bitsLeft); + // subtract out the bits we just added + bitsToWrite -= bitsLeft; + // zero out the bits above bitsToWrite + value &= (1 << bitsToWrite) - 1; + writeByte(); + } + bitsLeft -= bitsToWrite; + current |= value << bitsLeft; + if (bitsLeft == 0) { + writeByte(); + } + } + + void getPosition(PositionRecorder recorder) throws IOException { + output.getPosition(recorder); + recorder.addPosition(8 - bitsLeft); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BloomFilterIO.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BloomFilterIO.java new file mode 100644 index 0000000000..9d7c09cfb6 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BloomFilterIO.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.primitives.Longs; +import org.apache.tajo.storage.thirdparty.orc.util.BloomFilter; + +public class BloomFilterIO extends BloomFilter { + + public BloomFilterIO(long expectedEntries) { + super(expectedEntries, DEFAULT_FPP); + } + + public BloomFilterIO(long expectedEntries, double fpp) { + super(expectedEntries, fpp); + } + +/** + * Initializes the BloomFilter from the given Orc BloomFilter + */ + public BloomFilterIO(OrcProto.BloomFilter bloomFilter) { + this.bitSet = new BitSet(Longs.toArray(bloomFilter.getBitsetList())); + this.numHashFunctions = bloomFilter.getNumHashFunctions(); + this.numBits = (int) this.bitSet.bitSize(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanColumnStatistics.java new file mode 100644 index 0000000000..0f55697339 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/BooleanColumnStatistics.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +/** + * Statistics for boolean columns. + */ +public interface BooleanColumnStatistics extends ColumnStatistics { + long getFalseCount(); + + long getTrueCount(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatistics.java new file mode 100644 index 0000000000..b317e41a42 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatistics.java @@ -0,0 +1,36 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +/** + * Statistics that are available for all types of columns. + */ +public interface ColumnStatistics { + /** + * Get the number of values in this column. It will differ from the number + * of rows because of NULL values and repeated values. + * @return the number of values + */ + long getNumberOfValues(); + + /** + * Returns true if there are nulls in the scope of column statistics. + * @return true if null present else false + */ + boolean hasNull(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java new file mode 100644 index 0000000000..be2157a7cc --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java @@ -0,0 +1,1022 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; + +import java.sql.Date; +import java.sql.Timestamp; + +class ColumnStatisticsImpl implements ColumnStatistics { + + private static final class BooleanStatisticsImpl extends ColumnStatisticsImpl + implements BooleanColumnStatistics { + private long trueCount = 0; + + BooleanStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.BucketStatistics bkt = stats.getBucketStatistics(); + trueCount = bkt.getCount(0); + } + + BooleanStatisticsImpl() { + } + + @Override + void reset() { + super.reset(); + trueCount = 0; + } + + @Override + void updateBoolean(boolean value) { + if (value) { + trueCount += 1; + } + } + + @Override + void merge(ColumnStatisticsImpl other) { + if (other instanceof BooleanStatisticsImpl) { + BooleanStatisticsImpl bkt = (BooleanStatisticsImpl) other; + trueCount += bkt.trueCount; + } else { + if (isStatsExists() && trueCount != 0) { + throw new IllegalArgumentException("Incompatible merging of boolean column statistics"); + } + } + super.merge(other); + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder builder = super.serialize(); + OrcProto.BucketStatistics.Builder bucket = + OrcProto.BucketStatistics.newBuilder(); + bucket.addCount(trueCount); + builder.setBucketStatistics(bucket); + return builder; + } + + @Override + public long getFalseCount() { + return getNumberOfValues() - trueCount; + } + + @Override + public long getTrueCount() { + return trueCount; + } + + @Override + public String toString() { + return super.toString() + " true: " + trueCount; + } + } + + private static 
final class IntegerStatisticsImpl extends ColumnStatisticsImpl + implements IntegerColumnStatistics { + + private long minimum = Long.MAX_VALUE; + private long maximum = Long.MIN_VALUE; + private long sum = 0; + private boolean hasMinimum = false; + private boolean overflow = false; + + IntegerStatisticsImpl() { + } + + IntegerStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.IntegerStatistics intStat = stats.getIntStatistics(); + if (intStat.hasMinimum()) { + hasMinimum = true; + minimum = intStat.getMinimum(); + } + if (intStat.hasMaximum()) { + maximum = intStat.getMaximum(); + } + if (intStat.hasSum()) { + sum = intStat.getSum(); + } else { + overflow = true; + } + } + + @Override + void reset() { + super.reset(); + hasMinimum = false; + minimum = Long.MAX_VALUE; + maximum = Long.MIN_VALUE; + sum = 0; + overflow = false; + } + + @Override + void updateInteger(long value) { + if (!hasMinimum) { + hasMinimum = true; + minimum = value; + maximum = value; + } else if (value < minimum) { + minimum = value; + } else if (value > maximum) { + maximum = value; + } + if (!overflow) { + boolean wasPositive = sum >= 0; + sum += value; + if ((value >= 0) == wasPositive) { + overflow = (sum >= 0) != wasPositive; + } + } + } + + @Override + void merge(ColumnStatisticsImpl other) { + if (other instanceof IntegerStatisticsImpl) { + IntegerStatisticsImpl otherInt = (IntegerStatisticsImpl) other; + if (!hasMinimum) { + hasMinimum = otherInt.hasMinimum; + minimum = otherInt.minimum; + maximum = otherInt.maximum; + } else if (otherInt.hasMinimum) { + if (otherInt.minimum < minimum) { + minimum = otherInt.minimum; + } + if (otherInt.maximum > maximum) { + maximum = otherInt.maximum; + } + } + + overflow |= otherInt.overflow; + if (!overflow) { + boolean wasPositive = sum >= 0; + sum += otherInt.sum; + if ((otherInt.sum >= 0) == wasPositive) { + overflow = (sum >= 0) != wasPositive; + } + } + } else { + if (isStatsExists() && hasMinimum) { + throw new 
IllegalArgumentException("Incompatible merging of integer column statistics"); + } + } + super.merge(other); + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder builder = super.serialize(); + OrcProto.IntegerStatistics.Builder intb = + OrcProto.IntegerStatistics.newBuilder(); + if (hasMinimum) { + intb.setMinimum(minimum); + intb.setMaximum(maximum); + } + if (!overflow) { + intb.setSum(sum); + } + builder.setIntStatistics(intb); + return builder; + } + + @Override + public long getMinimum() { + return minimum; + } + + @Override + public long getMaximum() { + return maximum; + } + + @Override + public boolean isSumDefined() { + return !overflow; + } + + @Override + public long getSum() { + return sum; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (hasMinimum) { + buf.append(" min: "); + buf.append(minimum); + buf.append(" max: "); + buf.append(maximum); + } + if (!overflow) { + buf.append(" sum: "); + buf.append(sum); + } + return buf.toString(); + } + } + + private static final class DoubleStatisticsImpl extends ColumnStatisticsImpl + implements DoubleColumnStatistics { + private boolean hasMinimum = false; + private double minimum = Double.MAX_VALUE; + private double maximum = Double.MIN_VALUE; + private double sum = 0; + + DoubleStatisticsImpl() { + } + + DoubleStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.DoubleStatistics dbl = stats.getDoubleStatistics(); + if (dbl.hasMinimum()) { + hasMinimum = true; + minimum = dbl.getMinimum(); + } + if (dbl.hasMaximum()) { + maximum = dbl.getMaximum(); + } + if (dbl.hasSum()) { + sum = dbl.getSum(); + } + } + + @Override + void reset() { + super.reset(); + hasMinimum = false; + minimum = Double.MAX_VALUE; + maximum = Double.MIN_VALUE; + sum = 0; + } + + @Override + void updateDouble(double value) { + if (!hasMinimum) { + hasMinimum = true; + minimum = value; + maximum = value; 
+ } else if (value < minimum) { + minimum = value; + } else if (value > maximum) { + maximum = value; + } + sum += value; + } + + @Override + void merge(ColumnStatisticsImpl other) { + if (other instanceof DoubleStatisticsImpl) { + DoubleStatisticsImpl dbl = (DoubleStatisticsImpl) other; + if (!hasMinimum) { + hasMinimum = dbl.hasMinimum; + minimum = dbl.minimum; + maximum = dbl.maximum; + } else if (dbl.hasMinimum) { + if (dbl.minimum < minimum) { + minimum = dbl.minimum; + } + if (dbl.maximum > maximum) { + maximum = dbl.maximum; + } + } + sum += dbl.sum; + } else { + if (isStatsExists() && hasMinimum) { + throw new IllegalArgumentException("Incompatible merging of double column statistics"); + } + } + super.merge(other); + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder builder = super.serialize(); + OrcProto.DoubleStatistics.Builder dbl = + OrcProto.DoubleStatistics.newBuilder(); + if (hasMinimum) { + dbl.setMinimum(minimum); + dbl.setMaximum(maximum); + } + dbl.setSum(sum); + builder.setDoubleStatistics(dbl); + return builder; + } + + @Override + public double getMinimum() { + return minimum; + } + + @Override + public double getMaximum() { + return maximum; + } + + @Override + public double getSum() { + return sum; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (hasMinimum) { + buf.append(" min: "); + buf.append(minimum); + buf.append(" max: "); + buf.append(maximum); + } + buf.append(" sum: "); + buf.append(sum); + return buf.toString(); + } + } + + protected static final class StringStatisticsImpl extends ColumnStatisticsImpl + implements StringColumnStatistics { + private Text minimum = null; + private Text maximum = null; + private long sum = 0; + + StringStatisticsImpl() { + } + + StringStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.StringStatistics str = stats.getStringStatistics(); + if (str.hasMaximum()) { 
+ maximum = new Text(str.getMaximum()); + } + if (str.hasMinimum()) { + minimum = new Text(str.getMinimum()); + } + if(str.hasSum()) { + sum = str.getSum(); + } + } + + @Override + void reset() { + super.reset(); + minimum = null; + maximum = null; + sum = 0; + } + + @Override + void updateString(Text value) { + if (minimum == null) { + maximum = minimum = new Text(value); + } else if (minimum.compareTo(value) > 0) { + minimum = new Text(value); + } else if (maximum.compareTo(value) < 0) { + maximum = new Text(value); + } + sum += value.getLength(); + } + + @Override + void merge(ColumnStatisticsImpl other) { + if (other instanceof StringStatisticsImpl) { + StringStatisticsImpl str = (StringStatisticsImpl) other; + if (minimum == null) { + if (str.minimum != null) { + maximum = new Text(str.getMaximum()); + minimum = new Text(str.getMinimum()); + } else { + /* both are empty */ + maximum = minimum = null; + } + } else if (str.minimum != null) { + if (minimum.compareTo(str.minimum) > 0) { + minimum = new Text(str.getMinimum()); + } + if (maximum.compareTo(str.maximum) < 0) { + maximum = new Text(str.getMaximum()); + } + } + sum += str.sum; + } else { + if (isStatsExists() && minimum != null) { + throw new IllegalArgumentException("Incompatible merging of string column statistics"); + } + } + super.merge(other); + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder result = super.serialize(); + OrcProto.StringStatistics.Builder str = + OrcProto.StringStatistics.newBuilder(); + if (getNumberOfValues() != 0) { + str.setMinimum(getMinimum()); + str.setMaximum(getMaximum()); + str.setSum(sum); + } + result.setStringStatistics(str); + return result; + } + + @Override + public String getMinimum() { + return minimum == null ? null : minimum.toString(); + } + + @Override + public String getMaximum() { + return maximum == null ? 
null : maximum.toString(); + } + + @Override + public long getSum() { + return sum; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (getNumberOfValues() != 0) { + buf.append(" min: "); + buf.append(getMinimum()); + buf.append(" max: "); + buf.append(getMaximum()); + buf.append(" sum: "); + buf.append(sum); + } + return buf.toString(); + } + } + + protected static final class BinaryStatisticsImpl extends ColumnStatisticsImpl implements + BinaryColumnStatistics { + + private long sum = 0; + + BinaryStatisticsImpl() { + } + + BinaryStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.BinaryStatistics binStats = stats.getBinaryStatistics(); + if (binStats.hasSum()) { + sum = binStats.getSum(); + } + } + + @Override + void reset() { + super.reset(); + sum = 0; + } + + @Override + void updateBinary(BytesWritable value) { + sum += value.getLength(); + } + + @Override + void merge(ColumnStatisticsImpl other) { + if (other instanceof BinaryColumnStatistics) { + BinaryStatisticsImpl bin = (BinaryStatisticsImpl) other; + sum += bin.sum; + } else { + if (isStatsExists() && sum != 0) { + throw new IllegalArgumentException("Incompatible merging of binary column statistics"); + } + } + super.merge(other); + } + + @Override + public long getSum() { + return sum; + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder result = super.serialize(); + OrcProto.BinaryStatistics.Builder bin = OrcProto.BinaryStatistics.newBuilder(); + bin.setSum(sum); + result.setBinaryStatistics(bin); + return result; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (getNumberOfValues() != 0) { + buf.append(" sum: "); + buf.append(sum); + } + return buf.toString(); + } + } + + private static final class DecimalStatisticsImpl extends ColumnStatisticsImpl + implements DecimalColumnStatistics { + private 
HiveDecimal minimum = null; + private HiveDecimal maximum = null; + private HiveDecimal sum = HiveDecimal.ZERO; + + DecimalStatisticsImpl() { + } + + DecimalStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.DecimalStatistics dec = stats.getDecimalStatistics(); + if (dec.hasMaximum()) { + maximum = HiveDecimal.create(dec.getMaximum()); + } + if (dec.hasMinimum()) { + minimum = HiveDecimal.create(dec.getMinimum()); + } + if (dec.hasSum()) { + sum = HiveDecimal.create(dec.getSum()); + } else { + sum = null; + } + } + + @Override + void reset() { + super.reset(); + minimum = null; + maximum = null; + sum = HiveDecimal.ZERO; + } + + @Override + void updateDecimal(HiveDecimal value) { + if (minimum == null) { + minimum = value; + maximum = value; + } else if (minimum.compareTo(value) > 0) { + minimum = value; + } else if (maximum.compareTo(value) < 0) { + maximum = value; + } + if (sum != null) { + sum = sum.add(value); + } + } + + @Override + void merge(ColumnStatisticsImpl other) { + if (other instanceof DecimalStatisticsImpl) { + DecimalStatisticsImpl dec = (DecimalStatisticsImpl) other; + if (minimum == null) { + minimum = dec.minimum; + maximum = dec.maximum; + sum = dec.sum; + } else if (dec.minimum != null) { + if (minimum.compareTo(dec.minimum) > 0) { + minimum = dec.minimum; + } + if (maximum.compareTo(dec.maximum) < 0) { + maximum = dec.maximum; + } + if (sum == null || dec.sum == null) { + sum = null; + } else { + sum = sum.add(dec.sum); + } + } + } else { + if (isStatsExists() && minimum != null) { + throw new IllegalArgumentException("Incompatible merging of decimal column statistics"); + } + } + super.merge(other); + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder result = super.serialize(); + OrcProto.DecimalStatistics.Builder dec = + OrcProto.DecimalStatistics.newBuilder(); + if (getNumberOfValues() != 0 && minimum != null) { + dec.setMinimum(minimum.toString()); + 
dec.setMaximum(maximum.toString()); + } + if (sum != null) { + dec.setSum(sum.toString()); + } + result.setDecimalStatistics(dec); + return result; + } + + @Override + public HiveDecimal getMinimum() { + return minimum; + } + + @Override + public HiveDecimal getMaximum() { + return maximum; + } + + @Override + public HiveDecimal getSum() { + return sum; + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (getNumberOfValues() != 0) { + buf.append(" min: "); + buf.append(minimum); + buf.append(" max: "); + buf.append(maximum); + if (sum != null) { + buf.append(" sum: "); + buf.append(sum); + } + } + return buf.toString(); + } + } + + private static final class DateStatisticsImpl extends ColumnStatisticsImpl + implements DateColumnStatistics { + private Integer minimum = null; + private Integer maximum = null; + + DateStatisticsImpl() { + } + + DateStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.DateStatistics dateStats = stats.getDateStatistics(); + // min,max values serialized/deserialized as int (days since epoch) + if (dateStats.hasMaximum()) { + maximum = dateStats.getMaximum(); + } + if (dateStats.hasMinimum()) { + minimum = dateStats.getMinimum(); + } + } + + @Override + void reset() { + super.reset(); + minimum = null; + maximum = null; + } + + @Override + void updateDate(DateWritable value) { + if (minimum == null) { + minimum = value.getDays(); + maximum = value.getDays(); + } else if (minimum > value.getDays()) { + minimum = value.getDays(); + } else if (maximum < value.getDays()) { + maximum = value.getDays(); + } + } + + @Override + void merge(ColumnStatisticsImpl other) { + if (other instanceof DateStatisticsImpl) { + DateStatisticsImpl dateStats = (DateStatisticsImpl) other; + if (minimum == null) { + minimum = dateStats.minimum; + maximum = dateStats.maximum; + } else if (dateStats.minimum != null) { + if (minimum > dateStats.minimum) { + minimum = 
dateStats.minimum; + } + if (maximum < dateStats.maximum) { + maximum = dateStats.maximum; + } + } + } else { + if (isStatsExists() && minimum != null) { + throw new IllegalArgumentException("Incompatible merging of date column statistics"); + } + } + super.merge(other); + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder result = super.serialize(); + OrcProto.DateStatistics.Builder dateStats = + OrcProto.DateStatistics.newBuilder(); + if (getNumberOfValues() != 0 && minimum != null) { + dateStats.setMinimum(minimum); + dateStats.setMaximum(maximum); + } + result.setDateStatistics(dateStats); + return result; + } + + private transient final DateWritable minDate = new DateWritable(); + private transient final DateWritable maxDate = new DateWritable(); + + @Override + public Date getMinimum() { + if (minimum == null) { + return null; + } + minDate.set(minimum); + return minDate.get(); + } + + @Override + public Date getMaximum() { + if (maximum == null) { + return null; + } + maxDate.set(maximum); + return maxDate.get(); + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (getNumberOfValues() != 0) { + buf.append(" min: "); + buf.append(getMinimum()); + buf.append(" max: "); + buf.append(getMaximum()); + } + return buf.toString(); + } + } + + private static final class TimestampStatisticsImpl extends ColumnStatisticsImpl + implements TimestampColumnStatistics { + private Long minimum = null; + private Long maximum = null; + + TimestampStatisticsImpl() { + } + + TimestampStatisticsImpl(OrcProto.ColumnStatistics stats) { + super(stats); + OrcProto.TimestampStatistics timestampStats = stats.getTimestampStatistics(); + // min,max values serialized/deserialized as int (milliseconds since epoch) + if (timestampStats.hasMaximum()) { + maximum = timestampStats.getMaximum(); + } + if (timestampStats.hasMinimum()) { + minimum = timestampStats.getMinimum(); + } + } + + 
@Override + void reset() { + super.reset(); + minimum = null; + maximum = null; + } + + @Override + void updateTimestamp(Timestamp value) { + if (minimum == null) { + minimum = value.getTime(); + maximum = value.getTime(); + } else if (minimum > value.getTime()) { + minimum = value.getTime(); + } else if (maximum < value.getTime()) { + maximum = value.getTime(); + } + } + + @Override + void merge(ColumnStatisticsImpl other) { + if (other instanceof TimestampStatisticsImpl) { + TimestampStatisticsImpl timestampStats = (TimestampStatisticsImpl) other; + if (minimum == null) { + minimum = timestampStats.minimum; + maximum = timestampStats.maximum; + } else if (timestampStats.minimum != null) { + if (minimum > timestampStats.minimum) { + minimum = timestampStats.minimum; + } + if (maximum < timestampStats.maximum) { + maximum = timestampStats.maximum; + } + } + } else { + if (isStatsExists() && minimum != null) { + throw new IllegalArgumentException("Incompatible merging of timestamp column statistics"); + } + } + super.merge(other); + } + + @Override + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder result = super.serialize(); + OrcProto.TimestampStatistics.Builder timestampStats = OrcProto.TimestampStatistics + .newBuilder(); + if (getNumberOfValues() != 0 && minimum != null) { + timestampStats.setMinimum(minimum); + timestampStats.setMaximum(maximum); + } + result.setTimestampStatistics(timestampStats); + return result; + } + + @Override + public Timestamp getMinimum() { + return minimum == null ? null : new Timestamp(minimum); + } + + @Override + public Timestamp getMaximum() { + return maximum == null ? 
null : new Timestamp(maximum); + } + + @Override + public String toString() { + StringBuilder buf = new StringBuilder(super.toString()); + if (getNumberOfValues() != 0) { + buf.append(" min: "); + buf.append(getMinimum()); + buf.append(" max: "); + buf.append(getMaximum()); + } + return buf.toString(); + } + } + + private long count = 0; + private boolean hasNull = false; + + ColumnStatisticsImpl(OrcProto.ColumnStatistics stats) { + if (stats.hasNumberOfValues()) { + count = stats.getNumberOfValues(); + } + + if (stats.hasHasNull()) { + hasNull = stats.getHasNull(); + } else { + hasNull = true; + } + } + + ColumnStatisticsImpl() { + } + + void increment() { + count += 1; + } + + void setNull() { + hasNull = true; + } + + void updateBoolean(boolean value) { + throw new UnsupportedOperationException("Can't update boolean"); + } + + void updateInteger(long value) { + throw new UnsupportedOperationException("Can't update integer"); + } + + void updateDouble(double value) { + throw new UnsupportedOperationException("Can't update double"); + } + + void updateString(Text value) { + throw new UnsupportedOperationException("Can't update string"); + } + + void updateBinary(BytesWritable value) { + throw new UnsupportedOperationException("Can't update binary"); + } + + void updateDecimal(HiveDecimal value) { + throw new UnsupportedOperationException("Can't update decimal"); + } + + void updateDate(DateWritable value) { + throw new UnsupportedOperationException("Can't update date"); + } + + void updateTimestamp(Timestamp value) { + throw new UnsupportedOperationException("Can't update timestamp"); + } + + boolean isStatsExists() { + return (count > 0 || hasNull == true); + } + + void merge(ColumnStatisticsImpl stats) { + count += stats.count; + hasNull |= stats.hasNull; + } + + void reset() { + count = 0; + hasNull = false; + } + + @Override + public long getNumberOfValues() { + return count; + } + + @Override + public boolean hasNull() { + return hasNull; + } + + @Override + 
public String toString() { + return "count: " + count + " hasNull: " + hasNull; + } + + OrcProto.ColumnStatistics.Builder serialize() { + OrcProto.ColumnStatistics.Builder builder = + OrcProto.ColumnStatistics.newBuilder(); + builder.setNumberOfValues(count); + builder.setHasNull(hasNull); + return builder; + } + + static ColumnStatisticsImpl create(ObjectInspector inspector) { + switch (inspector.getCategory()) { + case PRIMITIVE: + switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { + case BOOLEAN: + return new BooleanStatisticsImpl(); + case BYTE: + case SHORT: + case INT: + case LONG: + return new IntegerStatisticsImpl(); + case FLOAT: + case DOUBLE: + return new DoubleStatisticsImpl(); + case STRING: + case CHAR: + case VARCHAR: + return new StringStatisticsImpl(); + case DECIMAL: + return new DecimalStatisticsImpl(); + case DATE: + return new DateStatisticsImpl(); + case TIMESTAMP: + return new TimestampStatisticsImpl(); + case BINARY: + return new BinaryStatisticsImpl(); + default: + return new ColumnStatisticsImpl(); + } + default: + return new ColumnStatisticsImpl(); + } + } + + static ColumnStatisticsImpl deserialize(OrcProto.ColumnStatistics stats) { + if (stats.hasBucketStatistics()) { + return new BooleanStatisticsImpl(stats); + } else if (stats.hasIntStatistics()) { + return new IntegerStatisticsImpl(stats); + } else if (stats.hasDoubleStatistics()) { + return new DoubleStatisticsImpl(stats); + } else if (stats.hasStringStatistics()) { + return new StringStatisticsImpl(stats); + } else if (stats.hasDecimalStatistics()) { + return new DecimalStatisticsImpl(stats); + } else if (stats.hasDateStatistics()) { + return new DateStatisticsImpl(stats); + } else if (stats.hasTimestampStatistics()) { + return new TimestampStatisticsImpl(stats); + } else if(stats.hasBinaryStatistics()) { + return new BinaryStatisticsImpl(stats); + } else { + return new ColumnStatisticsImpl(stats); + } + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionCodec.java new file mode 100644 index 0000000000..769ca50b21 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionCodec.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.EnumSet; + +public interface CompressionCodec { + + public enum Modifier { + /* speed/compression tradeoffs */ + FASTEST, + FAST, + DEFAULT, + /* data sensitivity modifiers */ + TEXT, + BINARY + }; + + /** + * Compress the in buffer to the out buffer. + * @param in the bytes to compress + * @param out the uncompressed bytes + * @param overflow put any additional bytes here + * @return true if the output is smaller than input + * @throws IOException + */ + boolean compress(ByteBuffer in, ByteBuffer out, ByteBuffer overflow + ) throws IOException; + + /** + * Decompress the in buffer to the out buffer. 
+ * @param in the bytes to decompress + * @param out the decompressed bytes + * @throws IOException + */ + void decompress(ByteBuffer in, ByteBuffer out) throws IOException; + + /** + * Produce a modified compression codec if the underlying algorithm allows + * modification. + * + * This does not modify the current object, but returns a new object if + * modifications are possible. Returns the same object if no modifications + * are possible. + * @param modifiers compression modifiers + * @return codec for use after optional modification + */ + CompressionCodec modify(@Nullable EnumSet modifiers); + +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionKind.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionKind.java new file mode 100644 index 0000000000..8b16c6711f --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/CompressionKind.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.thirdparty.orc; + +/** + * An enumeration that lists the generic compression algorithms that + * can be applied to ORC files. + */ +public enum CompressionKind { + NONE, ZLIB, SNAPPY, LZO +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DateColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DateColumnStatistics.java new file mode 100644 index 0000000000..cb3405e8da --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DateColumnStatistics.java @@ -0,0 +1,37 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import java.util.Date; + +/** + * Statistics for DATE columns. + */ +public interface DateColumnStatistics extends ColumnStatistics { + /** + * Get the minimum value for the column. + * @return minimum value + */ + Date getMinimum(); + + /** + * Get the maximum value for the column. 
+ * @return maximum value + */ + Date getMaximum(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DecimalColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DecimalColumnStatistics.java new file mode 100644 index 0000000000..27cdac2187 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DecimalColumnStatistics.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.hive.common.type.HiveDecimal; + +/** + * Statistics for decimal columns. + */ +public interface DecimalColumnStatistics extends ColumnStatistics { + + /** + * Get the minimum value for the column. + * @return the minimum value + */ + HiveDecimal getMinimum(); + + /** + * Get the maximum value for the column. + * @return the maximum value + */ + HiveDecimal getMaximum(); + + /** + * Get the sum of the values of the column. 
+ * @return the sum + */ + HiveDecimal getSum(); + +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DirectDecompressionCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DirectDecompressionCodec.java new file mode 100644 index 0000000000..53330523b5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DirectDecompressionCodec.java @@ -0,0 +1,26 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.IOException; +import java.nio.ByteBuffer; + +public interface DirectDecompressionCodec extends CompressionCodec { + public boolean isAvailable(); + public void directDecompress(ByteBuffer in, ByteBuffer out) throws IOException; +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleColumnStatistics.java new file mode 100644 index 0000000000..ddce8f7078 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DoubleColumnStatistics.java @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +/** + * Statistics for float and double columns. + */ +public interface DoubleColumnStatistics extends ColumnStatistics { + + /** + * Get the smallest value in the column. Only defined if getNumberOfValues + * is non-zero. + * @return the minimum + */ + double getMinimum(); + + /** + * Get the largest value in the column. Only defined if getNumberOfValues + * is non-zero. 
+ * @return the maximum + */ + double getMaximum(); + + /** + * Get the sum of the values in the column. + * @return the sum + */ + double getSum(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicByteArray.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicByteArray.java new file mode 100644 index 0000000000..1d44f77dba --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicByteArray.java @@ -0,0 +1,303 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.io.Text; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.ByteBuffer; + +/** + * A class that is a growable array of bytes. Growth is managed in terms of + * chunks that are allocated when needed. 
+ */ +final class DynamicByteArray { + static final int DEFAULT_CHUNKSIZE = 32 * 1024; + static final int DEFAULT_NUM_CHUNKS = 128; + + private final int chunkSize; // our allocation sizes + private byte[][] data; // the real data + private int length; // max set element index +1 + private int initializedChunks = 0; // the number of chunks created + + public DynamicByteArray() { + this(DEFAULT_NUM_CHUNKS, DEFAULT_CHUNKSIZE); + } + + public DynamicByteArray(int numChunks, int chunkSize) { + if (chunkSize == 0) { + throw new IllegalArgumentException("bad chunksize"); + } + this.chunkSize = chunkSize; + data = new byte[numChunks][]; + } + + /** + * Ensure that the given index is valid. + */ + private void grow(int chunkIndex) { + if (chunkIndex >= initializedChunks) { + if (chunkIndex >= data.length) { + int newSize = Math.max(chunkIndex + 1, 2 * data.length); + byte[][] newChunk = new byte[newSize][]; + System.arraycopy(data, 0, newChunk, 0, data.length); + data = newChunk; + } + for(int i=initializedChunks; i <= chunkIndex; ++i) { + data[i] = new byte[chunkSize]; + } + initializedChunks = chunkIndex + 1; + } + } + + public byte get(int index) { + if (index >= length) { + throw new IndexOutOfBoundsException("Index " + index + + " is outside of 0.." + + (length - 1)); + } + int i = index / chunkSize; + int j = index % chunkSize; + return data[i][j]; + } + + public void set(int index, byte value) { + int i = index / chunkSize; + int j = index % chunkSize; + grow(i); + if (index >= length) { + length = index + 1; + } + data[i][j] = value; + } + + public int add(byte value) { + int i = length / chunkSize; + int j = length % chunkSize; + grow(i); + data[i][j] = value; + int result = length; + length += 1; + return result; + } + + /** + * Copy a slice of a byte array into our buffer. 
+ * @param value the array to copy from + * @param valueOffset the first location to copy from value + * @param valueLength the number of bytes to copy from value + * @return the offset of the start of the value + */ + public int add(byte[] value, int valueOffset, int valueLength) { + int i = length / chunkSize; + int j = length % chunkSize; + grow((length + valueLength) / chunkSize); + int remaining = valueLength; + while (remaining > 0) { + int size = Math.min(remaining, chunkSize - j); + System.arraycopy(value, valueOffset, data[i], j, size); + remaining -= size; + valueOffset += size; + i += 1; + j = 0; + } + int result = length; + length += valueLength; + return result; + } + + /** + * Read the entire stream into this array. + * @param in the stream to read from + * @throws IOException + */ + public void readAll(InputStream in) throws IOException { + int currentChunk = length / chunkSize; + int currentOffset = length % chunkSize; + grow(currentChunk); + int currentLength = in.read(data[currentChunk], currentOffset, + chunkSize - currentOffset); + while (currentLength > 0) { + length += currentLength; + currentOffset = length % chunkSize; + if (currentOffset == 0) { + currentChunk = length / chunkSize; + grow(currentChunk); + } + currentLength = in.read(data[currentChunk], currentOffset, + chunkSize - currentOffset); + } + } + + /** + * Byte compare a set of bytes against the bytes in this dynamic array. 
+ * @param other source of the other bytes + * @param otherOffset start offset in the other array + * @param otherLength number of bytes in the other array + * @param ourOffset the offset in our array + * @param ourLength the number of bytes in our array + * @return negative for less, 0 for equal, positive for greater + */ + public int compare(byte[] other, int otherOffset, int otherLength, + int ourOffset, int ourLength) { + int currentChunk = ourOffset / chunkSize; + int currentOffset = ourOffset % chunkSize; + int maxLength = Math.min(otherLength, ourLength); + while (maxLength > 0 && + other[otherOffset] == data[currentChunk][currentOffset]) { + otherOffset += 1; + currentOffset += 1; + if (currentOffset == chunkSize) { + currentChunk += 1; + currentOffset = 0; + } + maxLength -= 1; + } + if (maxLength == 0) { + return otherLength - ourLength; + } + int otherByte = 0xff & other[otherOffset]; + int ourByte = 0xff & data[currentChunk][currentOffset]; + return otherByte > ourByte ? 1 : -1; + } + + /** + * Get the size of the array. + * @return the number of bytes in the array + */ + public int size() { + return length; + } + + /** + * Clear the array to its original pristine state. + */ + public void clear() { + length = 0; + for(int i=0; i < data.length; ++i) { + data[i] = null; + } + initializedChunks = 0; + } + + /** + * Set a text value from the bytes in this dynamic array. 
+ * @param result the value to set + * @param offset the start of the bytes to copy + * @param length the number of bytes to copy + */ + public void setText(Text result, int offset, int length) { + result.clear(); + int currentChunk = offset / chunkSize; + int currentOffset = offset % chunkSize; + int currentLength = Math.min(length, chunkSize - currentOffset); + while (length > 0) { + result.append(data[currentChunk], currentOffset, currentLength); + length -= currentLength; + currentChunk += 1; + currentOffset = 0; + currentLength = Math.min(length, chunkSize - currentOffset); + } + } + + /** + * Write out a range of this dynamic array to an output stream. + * @param out the stream to write to + * @param offset the first offset to write + * @param length the number of bytes to write + * @throws IOException + */ + public void write(OutputStream out, int offset, + int length) throws IOException { + int currentChunk = offset / chunkSize; + int currentOffset = offset % chunkSize; + while (length > 0) { + int currentLength = Math.min(length, chunkSize - currentOffset); + out.write(data[currentChunk], currentOffset, currentLength); + length -= currentLength; + currentChunk += 1; + currentOffset = 0; + } + } + + @Override + public String toString() { + int i; + StringBuilder sb = new StringBuilder(length * 3); + + sb.append('{'); + int l = length - 1; + for (i=0; i 0) { + result.put(data[currentChunk], currentOffset, currentLength); + length -= currentLength; + currentChunk += 1; + currentOffset = 0; + currentLength = Math.min(length, chunkSize - currentOffset); + } + } + + /** + * Gets all the bytes of the array. 
+ * + * @return Bytes of the array + */ + public byte[] get() { + byte[] result = null; + if (length > 0) { + int currentChunk = 0; + int currentOffset = 0; + int currentLength = Math.min(length, chunkSize); + int destOffset = 0; + result = new byte[length]; + int totalLength = length; + while (totalLength > 0) { + System.arraycopy(data[currentChunk], currentOffset, result, destOffset, currentLength); + destOffset += currentLength; + totalLength -= currentLength; + currentChunk += 1; + currentOffset = 0; + currentLength = Math.min(totalLength, chunkSize - currentOffset); + } + } + return result; + } + + /** + * Get the size of the buffers. + */ + public long getSizeInBytes() { + return initializedChunks * chunkSize; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicIntArray.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicIntArray.java new file mode 100644 index 0000000000..a34770663d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/DynamicIntArray.java @@ -0,0 +1,142 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +/** + * Dynamic int array that uses primitive types and chunks to avoid copying + * large number of integers when it resizes. + * + * The motivation for this class is memory optimization, i.e. space efficient + * storage of potentially huge arrays without good a-priori size guesses. + * + * The API of this class is between a primitive array and a AbstractList. It's + * not a Collection implementation because it handles primitive types, but the + * API could be extended to support iterators and the like. + * + * NOTE: Like standard Collection implementations/arrays, this class is not + * synchronized. + */ +final class DynamicIntArray { + static final int DEFAULT_CHUNKSIZE = 8 * 1024; + static final int INIT_CHUNKS = 128; + + private final int chunkSize; // our allocation size + private int[][] data; // the real data + private int length; // max set element index +1 + private int initializedChunks = 0; // the number of created chunks + + public DynamicIntArray() { + this(DEFAULT_CHUNKSIZE); + } + + public DynamicIntArray(int chunkSize) { + this.chunkSize = chunkSize; + + data = new int[INIT_CHUNKS][]; + } + + /** + * Ensure that the given index is valid. + */ + private void grow(int chunkIndex) { + if (chunkIndex >= initializedChunks) { + if (chunkIndex >= data.length) { + int newSize = Math.max(chunkIndex + 1, 2 * data.length); + int[][] newChunk = new int[newSize][]; + System.arraycopy(data, 0, newChunk, 0, data.length); + data = newChunk; + } + for (int i=initializedChunks; i <= chunkIndex; ++i) { + data[i] = new int[chunkSize]; + } + initializedChunks = chunkIndex + 1; + } + } + + public int get(int index) { + if (index >= length) { + throw new IndexOutOfBoundsException("Index " + index + + " is outside of 0.." 
+ + (length - 1)); + } + int i = index / chunkSize; + int j = index % chunkSize; + return data[i][j]; + } + + public void set(int index, int value) { + int i = index / chunkSize; + int j = index % chunkSize; + grow(i); + if (index >= length) { + length = index + 1; + } + data[i][j] = value; + } + + public void increment(int index, int value) { + int i = index / chunkSize; + int j = index % chunkSize; + grow(i); + if (index >= length) { + length = index + 1; + } + data[i][j] += value; + } + + public void add(int value) { + int i = length / chunkSize; + int j = length % chunkSize; + grow(i); + data[i][j] = value; + length += 1; + } + + public int size() { + return length; + } + + public void clear() { + length = 0; + for(int i=0; i < data.length; ++i) { + data[i] = null; + } + initializedChunks = 0; + } + + public String toString() { + int i; + StringBuilder sb = new StringBuilder(length * 4); + + sb.append('{'); + int l = length - 1; + for (i=0; i writerList = + new HashMap(); + private long totalAllocation = 0; + private double currentScale = 1; + private int rowsAddedSinceCheck = 0; + + private static class WriterInfo { + long allocation; + Callback callback; + WriterInfo(long allocation, Callback callback) { + this.allocation = allocation; + this.callback = callback; + } + } + + public interface Callback { + /** + * The writer needs to check its memory usage + * @param newScale the current scale factor for memory allocations + * @return true if the writer was over the limit + * @throws IOException + */ + boolean checkMemory(double newScale) throws IOException; + } + + /** + * Create the memory manager. + * @param conf use the configuration to find the maximum size of the memory + * pool. + */ + MemoryManager(Configuration conf) { + HiveConf.ConfVars poolVar = HiveConf.ConfVars.HIVE_ORC_FILE_MEMORY_POOL; + double maxLoad = conf.getFloat(poolVar.varname, poolVar.defaultFloatVal); + totalMemoryPool = Math.round(ManagementFactory.getMemoryMXBean(). 
+ getHeapMemoryUsage().getMax() * maxLoad); + } + + /** + * Add a new writer's memory allocation to the pool. We use the path + * as a unique key to ensure that we don't get duplicates. + * @param path the file that is being written + * @param requestedAllocation the requested buffer size + */ + synchronized void addWriter(Path path, long requestedAllocation, + Callback callback) throws IOException { + WriterInfo oldVal = writerList.get(path); + // this should always be null, but we handle the case where the memory + // manager wasn't told that a writer wasn't still in use and the task + // starts writing to the same path. + if (oldVal == null) { + oldVal = new WriterInfo(requestedAllocation, callback); + writerList.put(path, oldVal); + totalAllocation += requestedAllocation; + } else { + // handle a new writer that is writing to the same path + totalAllocation += requestedAllocation - oldVal.allocation; + oldVal.allocation = requestedAllocation; + oldVal.callback = callback; + } + updateScale(true); + } + + /** + * Remove the given writer from the pool. + * @param path the file that has been closed + */ + synchronized void removeWriter(Path path) throws IOException { + WriterInfo val = writerList.get(path); + if (val != null) { + writerList.remove(path); + totalAllocation -= val.allocation; + if (writerList.isEmpty()) { + rowsAddedSinceCheck = 0; + } + updateScale(false); + } + if(writerList.isEmpty()) { + rowsAddedSinceCheck = 0; + } + } + + /** + * Get the total pool size that is available for ORC writers. + * @return the number of bytes in the pool + */ + long getTotalMemoryPool() { + return totalMemoryPool; + } + + /** + * The scaling factor for each allocation to ensure that the pool isn't + * oversubscribed. + * @return a fraction between 0.0 and 1.0 of the requested size that is + * available for each writer. 
+ */ + synchronized double getAllocationScale() { + return currentScale; + } + + /** + * Give the memory manager an opportunity for doing a memory check. + * @throws IOException + */ + synchronized void addedRow() throws IOException { + if (++rowsAddedSinceCheck >= ROWS_BETWEEN_CHECKS) { + notifyWriters(); + } + } + + /** + * Notify all of the writers that they should check their memory usage. + * @throws IOException + */ + void notifyWriters() throws IOException { + LOG.debug("Notifying writers after " + rowsAddedSinceCheck); + for(WriterInfo writer: writerList.values()) { + boolean flushed = writer.callback.checkMemory(currentScale); + if (LOG.isDebugEnabled() && flushed) { + LOG.debug("flushed " + writer.toString()); + } + } + rowsAddedSinceCheck = 0; + } + + /** + * Update the currentScale based on the current allocation and pool size. + * This also updates the notificationTrigger. + * @param isAllocate is this an allocation? + */ + private void updateScale(boolean isAllocate) throws IOException { + if (totalAllocation <= totalMemoryPool) { + currentScale = 1; + } else { + currentScale = (double) totalMemoryPool / totalAllocation; + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Metadata.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Metadata.java new file mode 100644 index 0000000000..dfa4c36d1b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Metadata.java @@ -0,0 +1,45 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.Lists; + +import java.util.List; + +public class Metadata { + + private final OrcProto.Metadata metadata; + + Metadata(OrcProto.Metadata m) { + this.metadata = m; + } + + /** + * Return list of stripe level column statistics + * + * @return list of stripe statistics + */ + public List getStripeStatistics() { + List result = Lists.newArrayList(); + for (OrcProto.StripeStatistics ss : metadata.getStripeStatsList()) { + result.add(new StripeStatistics(ss.getColStatsList())); + } + return result; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcConf.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcConf.java new file mode 100644 index 0000000000..b704666524 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcConf.java @@ -0,0 +1,149 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. + +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.conf.Configuration; + +// All configs in this class also appear in HiveConf, so any changes here should also be made there +// This is because only HiveConf can provide type checking through the CLI, and Presto depends on +// open source Hive, and so won't work with any variables not in open source HiveConf +public class OrcConf { + + public enum ConfVars { + HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD("hive.exec.orc.dictionary.key.size.threshold", 0.8f), + + // Maximum fraction of heap that can be used by ORC file writers + HIVE_ORC_FILE_MEMORY_POOL("hive.exec.orc.memory.pool", 0.5f), // 50% + HIVE_ORC_FILE_MIN_MEMORY_ALLOCATION("hive.exec.orc.min.mem.allocation", 4194304L), // 4 Mb + HIVE_ORC_FILE_ENABLE_LOW_MEMORY_MODE("hive.exec.orc.low.memory", false), + HIVE_ORC_ROW_BUFFER_SIZE("hive.exec.orc.row.buffer.size", 100), + + HIVE_ORC_EAGER_HDFS_READ("hive.exec.orc.eager.hdfs.read", true), + HIVE_ORC_EAGER_HDFS_READ_BYTES("hive.exec.orc.eager.hdfs.read.bytes", 193986560), // 185 Mb + + HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK("hive.orc.row.index.stride.dictionary.check", true), + HIVE_ORC_DEFAULT_STRIPE_SIZE("hive.exec.orc.default.stripe.size", 64L * 1024 * 1024), + HIVE_ORC_DEFAULT_BLOCK_SIZE("hive.exec.orc.default.block.size", 256L * 1024 * 1024), + HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE("hive.exec.orc.default.row.index.stride", 10000), + HIVE_ORC_DEFAULT_BUFFER_SIZE("hive.exec.orc.default.buffer.size", 256 * 1024), + 
HIVE_ORC_DEFAULT_BLOCK_PADDING("hive.exec.orc.default.block.padding", true), + HIVE_ORC_DEFAULT_COMPRESS("hive.exec.orc.default.compress", "ZLIB"), + HIVE_ORC_WRITE_FORMAT("hive.exec.orc.write.format", null), // 0.11 or 0.12 + HIVE_ORC_ENCODING_STRATEGY("hive.exec.orc.encoding.strategy", "SPEED"), + HIVE_ORC_COMPRESSION_STRATEGY("hive.exec.orc.compression.strategy", "SPEED"), + HIVE_ORC_BLOCK_PADDING_TOLERANCE("hive.exec.orc.block.padding.tolerance", 0.05f), + + ; + + public final String varname; + public final String defaultVal; + public final int defaultIntVal; + public final long defaultLongVal; + public final float defaultFloatVal; + public final boolean defaultBoolVal; + + + ConfVars(String varname, String defaultVal) { + this.varname = varname; + this.defaultVal = defaultVal; + this.defaultIntVal = -1; + this.defaultLongVal = -1; + this.defaultFloatVal = -1; + this.defaultBoolVal = false; + } + + ConfVars(String varname, int defaultIntVal) { + this.varname = varname; + this.defaultVal = Integer.toString(defaultIntVal); + this.defaultIntVal = defaultIntVal; + this.defaultLongVal = -1; + this.defaultFloatVal = -1; + this.defaultBoolVal = false; + } + + ConfVars(String varname, long defaultLongVal) { + this.varname = varname; + this.defaultVal = Long.toString(defaultLongVal); + this.defaultIntVal = -1; + this.defaultLongVal = defaultLongVal; + this.defaultFloatVal = -1; + this.defaultBoolVal = false; + } + + ConfVars(String varname, float defaultFloatVal) { + this.varname = varname; + this.defaultVal = Float.toString(defaultFloatVal); + this.defaultIntVal = -1; + this.defaultLongVal = -1; + this.defaultFloatVal = defaultFloatVal; + this.defaultBoolVal = false; + } + + ConfVars(String varname, boolean defaultBoolVal) { + this.varname = varname; + this.defaultVal = Boolean.toString(defaultBoolVal); + this.defaultIntVal = -1; + this.defaultLongVal = -1; + this.defaultFloatVal = -1; + this.defaultBoolVal = defaultBoolVal; + } + } + + public static int 
getIntVar(Configuration conf, ConfVars var) { + return conf.getInt(var.varname, var.defaultIntVal); + } + + public static void setIntVar(Configuration conf, ConfVars var, int val) { + conf.setInt(var.varname, val); + } + + public static long getLongVar(Configuration conf, ConfVars var) { + return conf.getLong(var.varname, var.defaultLongVal); + } + + public static void setLongVar(Configuration conf, ConfVars var, long val) { + conf.setLong(var.varname, val); + } + + public static float getFloatVar(Configuration conf, ConfVars var) { + return conf.getFloat(var.varname, var.defaultFloatVal); + } + + public static void setFloatVar(Configuration conf, ConfVars var, float val) { + conf.setFloat(var.varname, val); + } + + public static boolean getBoolVar(Configuration conf, ConfVars var) { + return conf.getBoolean(var.varname, var.defaultBoolVal); + } + + public static void setBoolVar(Configuration conf, ConfVars var, boolean val) { + conf.setBoolean(var.varname, val); + } + + public static String getVar(Configuration conf, ConfVars var) { + return conf.get(var.varname, var.defaultVal); + } + + public static void setVar(Configuration conf, ConfVars var, String val) { + conf.set(var.varname, val); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java new file mode 100644 index 0000000000..e49c03af90 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java @@ -0,0 +1,443 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; + +import static org.apache.tajo.storage.thirdparty.orc.OrcConf.ConfVars.*; + +import java.io.IOException; + +/** + * Contains factory methods to read or write ORC files. + */ +public final class OrcFile { + + public static final String MAGIC = "ORC"; + + /** + * Create a version number for the ORC file format, so that we can add + * non-forward compatible changes in the future. To make it easier for users + * to understand the version numbers, we use the Hive release number that + * first wrote that version of ORC files. + * + * Thus, if you add new encodings or other non-forward compatible changes + * to ORC files, which prevent the old reader from reading the new format, + * you should change these variable to reflect the next Hive release number. + * Non-forward compatible changes should never be added in patch releases. + * + * Do not make any changes that break backwards compatibility, which would + * prevent the new reader from reading ORC files generated by any released + * version of Hive. 
+ */ + public static enum Version { + V_0_11("0.11", 0, 11), + V_0_12("0.12", 0, 12); + + public static final Version CURRENT = V_0_12; + + private final String name; + private final int major; + private final int minor; + + private Version(String name, int major, int minor) { + this.name = name; + this.major = major; + this.minor = minor; + } + + public static Version byName(String name) { + for(Version version: values()) { + if (version.name.equals(name)) { + return version; + } + } + throw new IllegalArgumentException("Unknown ORC version " + name); + } + + /** + * Get the human readable name for the version. + */ + public String getName() { + return name; + } + + /** + * Get the major version number. + */ + public int getMajor() { + return major; + } + + /** + * Get the minor version number. + */ + public int getMinor() { + return minor; + } + } + + /** + * Records the version of the writer in terms of which bugs have been fixed. + * For bugs in the writer, but the old readers already read the new data + * correctly, bump this version instead of the Version. 
+ */ + public static enum WriterVersion { + ORIGINAL(0), + HIVE_8732(1); // corrupted stripe/file maximum column statistics + + private final int id; + + public int getId() { + return id; + } + + private WriterVersion(int id) { + this.id = id; + } + } + + public static enum EncodingStrategy { + SPEED, COMPRESSION; + } + + public static enum CompressionStrategy { + SPEED, COMPRESSION; + } + + // Note : these string definitions for table properties are deprecated, + // and retained only for backward compatibility, please do not add to + // them, add to OrcTableProperties below instead + @Deprecated public static final String COMPRESSION = "orc.compress"; + @Deprecated public static final String COMPRESSION_BLOCK_SIZE = "orc.compress.size"; + @Deprecated public static final String STRIPE_SIZE = "orc.stripe.size"; + @Deprecated public static final String ROW_INDEX_STRIDE = "orc.row.index.stride"; + @Deprecated public static final String ENABLE_INDEXES = "orc.create.index"; + @Deprecated public static final String BLOCK_PADDING = "orc.block.padding"; + + /** + * Enum container for all orc table properties. + * If introducing a new orc-specific table property, + * add it here. 
+ */ + public static enum OrcTableProperties { + COMPRESSION("orc.compress"), + COMPRESSION_BLOCK_SIZE("orc.compress.size"), + STRIPE_SIZE("orc.stripe.size"), + BLOCK_SIZE("orc.block.size"), + ROW_INDEX_STRIDE("orc.row.index.stride"), + ENABLE_INDEXES("orc.create.index"), + BLOCK_PADDING("orc.block.padding"), + ENCODING_STRATEGY("orc.encoding.strategy"), + BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns"), + BLOOM_FILTER_FPP("orc.bloom.filter.fpp"); + + private final String propName; + + OrcTableProperties(String propName) { + this.propName = propName; + } + + public String getPropName(){ + return this.propName; + } + } + + // unused + private OrcFile() {} + + public static interface WriterContext { + Writer getWriter(); + } + + public static interface WriterCallback { + public void preStripeWrite(WriterContext context) throws IOException; + public void preFooterWrite(WriterContext context) throws IOException; + } + + /** + * Options for creating ORC file writers. + */ + public static class WriterOptions { + private final Configuration configuration; + private FileSystem fileSystemValue = null; + private ObjectInspector inspectorValue = null; + private long stripeSizeValue; + private long blockSizeValue; + private int rowIndexStrideValue; + private int bufferSizeValue; + private boolean blockPaddingValue; + private CompressionKind compressValue; + private MemoryManager memoryManagerValue; + private Version versionValue; + private WriterCallback callback; + private EncodingStrategy encodingStrategy; + private CompressionStrategy compressionStrategy; + private float paddingTolerance; + private String bloomFilterColumns; + private double bloomFilterFpp; + + WriterOptions(Configuration conf) { + configuration = conf; + memoryManagerValue = getMemoryManager(conf); + stripeSizeValue = OrcConf.getLongVar(conf, HIVE_ORC_DEFAULT_STRIPE_SIZE); + blockSizeValue = OrcConf.getLongVar(conf, HIVE_ORC_DEFAULT_BLOCK_SIZE); + rowIndexStrideValue = OrcConf.getIntVar(conf, 
HIVE_ORC_DEFAULT_ROW_INDEX_STRIDE); + bufferSizeValue = OrcConf.getIntVar(conf, HIVE_ORC_DEFAULT_BUFFER_SIZE); + blockPaddingValue = OrcConf.getBoolVar(conf, HIVE_ORC_DEFAULT_BLOCK_PADDING); + compressValue = CompressionKind.valueOf(OrcConf.getVar(conf, HIVE_ORC_DEFAULT_COMPRESS)); + String versionName = OrcConf.getVar(conf, HIVE_ORC_WRITE_FORMAT); + if (versionName == null) { + versionValue = Version.CURRENT; + } else { + versionValue = Version.byName(versionName); + } + String enString = + conf.get(OrcConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname); + if (enString == null) { + encodingStrategy = EncodingStrategy.SPEED; + } else { + encodingStrategy = EncodingStrategy.valueOf(enString); + } + + String compString = conf + .get(OrcConf.ConfVars.HIVE_ORC_COMPRESSION_STRATEGY.varname); + if (compString == null) { + compressionStrategy = CompressionStrategy.SPEED; + } else { + compressionStrategy = CompressionStrategy.valueOf(compString); + } + + paddingTolerance = conf.getFloat(OrcConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.varname, + OrcConf.ConfVars.HIVE_ORC_BLOCK_PADDING_TOLERANCE.defaultFloatVal); + bloomFilterFpp = BloomFilterIO.DEFAULT_FPP; + } + + /** + * Provide the filesystem for the path, if the client has it available. + * If it is not provided, it will be found from the path. + */ + public WriterOptions fileSystem(FileSystem value) { + fileSystemValue = value; + return this; + } + + /** + * Set the stripe size for the file. The writer stores the contents of the + * stripe in memory until this memory limit is reached and the stripe + * is flushed to the HDFS file and the next stripe started. + */ + public WriterOptions stripeSize(long value) { + stripeSizeValue = value; + return this; + } + + /** + * Set the file system block size for the file. For optimal performance, + * set the block size to be multiple factors of stripe size. 
+ */ + public WriterOptions blockSize(long value) { + blockSizeValue = value; + return this; + } + + /** + * Set the distance between entries in the row index. The minimum value is + * 1000 to prevent the index from overwhelming the data. If the stride is + * set to 0, no indexes will be included in the file. + */ + public WriterOptions rowIndexStride(int value) { + rowIndexStrideValue = value; + return this; + } + + /** + * The size of the memory buffers used for compressing and storing the + * stripe in memory. + */ + public WriterOptions bufferSize(int value) { + bufferSizeValue = value; + return this; + } + + /** + * Sets whether the HDFS blocks are padded to prevent stripes from + * straddling blocks. Padding improves locality and thus the speed of + * reading, but costs space. + */ + public WriterOptions blockPadding(boolean value) { + blockPaddingValue = value; + return this; + } + + /** + * Sets the encoding strategy that is used to encode the data. + */ + public WriterOptions encodingStrategy(EncodingStrategy strategy) { + encodingStrategy = strategy; + return this; + } + + /** + * Sets the tolerance for block padding as a percentage of stripe size. + */ + public WriterOptions paddingTolerance(float value) { + paddingTolerance = value; + return this; + } + + /** + * Comma separated values of column names for which bloom filter is to be created. + */ + public WriterOptions bloomFilterColumns(String columns) { + bloomFilterColumns = columns; + return this; + } + + /** + * Specify the false positive probability for bloom filter. + * @param fpp - false positive probability + * @return + */ + public WriterOptions bloomFilterFpp(double fpp) { + bloomFilterFpp = fpp; + return this; + } + + /** + * Sets the generic compression that is used to compress the data. + */ + public WriterOptions compress(CompressionKind value) { + compressValue = value; + return this; + } + + /** + * A required option that sets the object inspector for the rows. 
Used + * to determine the schema for the file. + */ + public WriterOptions inspector(ObjectInspector value) { + inspectorValue = value; + return this; + } + + /** + * Sets the version of the file that will be written. + */ + public WriterOptions version(Version value) { + versionValue = value; + return this; + } + + /** + * Add a listener for when the stripe and file are about to be closed. + * @param callback the object to be called when the stripe is closed + * @return + */ + public WriterOptions callback(WriterCallback callback) { + this.callback = callback; + return this; + } + + /** + * A package local option to set the memory manager. + */ + WriterOptions memory(MemoryManager value) { + memoryManagerValue = value; + return this; + } + + } + + /** + * Create a default set of write options that can be modified. + */ + public static WriterOptions writerOptions(Configuration conf) { + return new WriterOptions(conf); + } + + /** + * Create an ORC file writer. This is the public interface for creating + * writers going forward and new options will only be added to this method. + * @param path filename to write to + * @param opts the options + * @return a new ORC file writer + * @throws IOException + */ + public static Writer createWriter(Path path, + WriterOptions opts + ) throws IOException { + FileSystem fs = opts.fileSystemValue == null ? + path.getFileSystem(opts.configuration) : opts.fileSystemValue; + + return new WriterImpl(fs, path, opts.configuration, opts.inspectorValue, + opts.stripeSizeValue, opts.compressValue, + opts.bufferSizeValue, opts.rowIndexStrideValue, + opts.memoryManagerValue, opts.blockPaddingValue, + opts.versionValue, opts.callback, + opts.encodingStrategy, opts.compressionStrategy, + opts.paddingTolerance, opts.blockSizeValue, + opts.bloomFilterColumns, opts.bloomFilterFpp); + } + + /** + * Create an ORC file writer. This method is provided for API backward + * compatability with Hive 0.11. 
+ * @param fs file system + * @param path filename to write to + * @param inspector the ObjectInspector that inspects the rows + * @param stripeSize the number of bytes in a stripe + * @param compress how to compress the file + * @param bufferSize the number of bytes to compress at once + * @param rowIndexStride the number of rows between row index entries or + * 0 to suppress all indexes + * @return a new ORC file writer + * @throws IOException + */ + public static Writer createWriter(FileSystem fs, + Path path, + Configuration conf, + ObjectInspector inspector, + long stripeSize, + CompressionKind compress, + int bufferSize, + int rowIndexStride) throws IOException { + return createWriter(path, + writerOptions(conf) + .fileSystem(fs) + .inspector(inspector) + .stripeSize(stripeSize) + .compress(compress) + .bufferSize(bufferSize) + .rowIndexStride(rowIndexStride)); + } + + private static MemoryManager memoryManager = null; + + private static synchronized MemoryManager getMemoryManager(Configuration conf) { + if (memoryManager == null) { + memoryManager = new MemoryManager(conf); + } + return memoryManager; + } + +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileKeyWrapper.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileKeyWrapper.java new file mode 100644 index 0000000000..ce72cf4c0d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileKeyWrapper.java @@ -0,0 +1,114 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.WritableComparable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.List; + +/** + * Key for OrcFileMergeMapper task. Contains orc file related information that + * should match before merging two orc files. + */ +public class OrcFileKeyWrapper implements WritableComparable { + + private Path inputPath; + private CompressionKind compression; + private long compressBufferSize; + private List types; + private int rowIndexStride; + private OrcFile.Version version; + private boolean isIncompatFile; + + public boolean isIncompatFile() { + return isIncompatFile; + } + + public void setIsIncompatFile(boolean isIncompatFile) { + this.isIncompatFile = isIncompatFile; + } + + public OrcFile.Version getVersion() { + return version; + } + + public void setVersion(OrcFile.Version version) { + this.version = version; + } + + public int getRowIndexStride() { + return rowIndexStride; + } + + public void setRowIndexStride(int rowIndexStride) { + this.rowIndexStride = rowIndexStride; + } + + public long getCompressBufferSize() { + return compressBufferSize; + } + + public void setCompressBufferSize(long compressBufferSize) { + this.compressBufferSize = compressBufferSize; + } + + public CompressionKind getCompression() { + return compression; + } + + public void setCompression(CompressionKind compression) { + this.compression = compression; + } + + public List getTypes() { + return types; + } + 
+ public void setTypes(List types) { + this.types = types; + } + + public Path getInputPath() { + return inputPath; + } + + public void setInputPath(Path inputPath) { + this.inputPath = inputPath; + } + + @Override + public void write(DataOutput out) throws IOException { + throw new RuntimeException("Not supported."); + } + + @Override + public void readFields(DataInput in) throws IOException { + throw new RuntimeException("Not supported."); + } + + @Override + public int compareTo(OrcFileKeyWrapper o) { + return inputPath.compareTo(o.inputPath); + } + +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileValueWrapper.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileValueWrapper.java new file mode 100644 index 0000000000..77daf6c289 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileValueWrapper.java @@ -0,0 +1,92 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.io.WritableComparable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.List; + +/** + * Value for OrcFileMergeMapper. Contains stripe related information for the + * current orc file that is being merged. + */ +public class OrcFileValueWrapper implements WritableComparable { + + protected StripeInformation stripeInformation; + protected OrcProto.StripeStatistics stripeStatistics; + protected List userMetadata; + protected boolean lastStripeInFile; + + public List getUserMetadata() { + return userMetadata; + } + + public void setUserMetadata(List userMetadata) { + this.userMetadata = userMetadata; + } + + public boolean isLastStripeInFile() { + return lastStripeInFile; + } + + public void setLastStripeInFile(boolean lastStripeInFile) { + this.lastStripeInFile = lastStripeInFile; + } + + public OrcProto.StripeStatistics getStripeStatistics() { + return stripeStatistics; + } + + public void setStripeStatistics(OrcProto.StripeStatistics stripeStatistics) { + this.stripeStatistics = stripeStatistics; + } + + public StripeInformation getStripeInformation() { + return stripeInformation; + } + + public void setStripeInformation(StripeInformation stripeInformation) { + this.stripeInformation = stripeInformation; + } + + @Override + public void write(DataOutput out) throws IOException { + throw new RuntimeException("Not supported."); + } + + @Override + public void readFields(DataInput in) throws IOException { + throw new RuntimeException("Not supported."); + } + + @Override + public int compareTo(OrcFileValueWrapper o) { + if (stripeInformation.getOffset() < o.getStripeInformation().getOffset()) { + return -1; + } else if (stripeInformation.getOffset() > o.getStripeInformation().getOffset()) { + return 1; + } else { + return 0; + } + } + +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcNewOutputFormat.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcNewOutputFormat.java new file mode 100644 index 0000000000..f5fd2ab78c --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcNewOutputFormat.java @@ -0,0 +1,78 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; + +import java.io.IOException; +import java.util.ArrayList; + +/** An OutputFormat that writes ORC files. 
*/ +public class OrcNewOutputFormat extends + FileOutputFormat { + + private static class OrcRecordWriter + extends RecordWriter { + private Writer writer = null; + private final Path path; + private final OrcFile.WriterOptions options; + OrcRecordWriter(Path path, OrcFile.WriterOptions options) { + this.path = path; + this.options = options; + } + @Override + public void write(NullWritable key, OrcSerde.OrcSerdeRow row) + throws IOException, InterruptedException { + if (writer == null) { + options.inspector(row.getInspector()); + writer = OrcFile.createWriter(path, options); + } + writer.addRow(row.getRow()); + } + + @Override + public void close(TaskAttemptContext context) + throws IOException, InterruptedException { + if (writer == null) { + // a row with no columns + ObjectInspector inspector = ObjectInspectorFactory. + getStandardStructObjectInspector(new ArrayList(), + new ArrayList()); + options.inspector(inspector); + writer = OrcFile.createWriter(path, options); + } + writer.close(); + } + } + + @Override + public RecordWriter getRecordWriter(TaskAttemptContext context) + throws IOException, InterruptedException { + Path file = getDefaultWorkFile(context, ""); + return new + OrcRecordWriter(file, OrcFile.writerOptions( + ShimLoader.getHadoopShims().getConfiguration(context))); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcOutputFormat.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcOutputFormat.java new file mode 100644 index 0000000000..eceaa97f7d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcOutputFormat.java @@ -0,0 +1,189 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.Writable; +import org.apache.hadoop.mapred.FileOutputFormat; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordWriter; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.util.Progressable; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Properties; + +/** + * A Hive OutputFormat for ORC files. 
+ */ +public class OrcOutputFormat extends FileOutputFormat { + private static class OrcRecordWriter + implements RecordWriter, + StatsProvidingRecordWriter { + private Writer writer = null; + private final Path path; + private final OrcFile.WriterOptions options; + private final SerDeStats stats; + + OrcRecordWriter(Path path, OrcFile.WriterOptions options) { + this.path = path; + this.options = options; + this.stats = new SerDeStats(); + } + + @Override + public void write(NullWritable nullWritable, + OrcSerde.OrcSerdeRow row) throws IOException { + if (writer == null) { + options.inspector(row.getInspector()); + writer = OrcFile.createWriter(path, options); + } + writer.addRow(row.getRow()); + } + + @Override + public void write(Writable row) throws IOException { + OrcSerde.OrcSerdeRow serdeRow = (OrcSerde.OrcSerdeRow) row; + if (writer == null) { + options.inspector(serdeRow.getInspector()); + writer = OrcFile.createWriter(path, options); + } + writer.addRow(serdeRow.getRow()); + } + + @Override + public void close(Reporter reporter) throws IOException { + close(true); + } + + @Override + public void close(boolean b) throws IOException { + // if we haven't written any rows, we need to create a file with a + // generic schema. + if (writer == null) { + // a row with no columns + ObjectInspector inspector = ObjectInspectorFactory. + getStandardStructObjectInspector(new ArrayList(), + new ArrayList()); + options.inspector(inspector); + writer = OrcFile.createWriter(path, options); + } + writer.close(); + } + + @Override + public SerDeStats getStats() { + stats.setRawDataSize(writer.getRawDataSize()); + stats.setRowCount(writer.getNumberOfRows()); + return stats; + } + } + + /** + * Helper method to get a parameter first from props if present, falling back to JobConf if not. + * Returns null if key is present in neither. 
+ */ + private String getSettingFromPropsFallingBackToConf(String key, Properties props, JobConf conf){ + if ((props != null) && props.containsKey(key)){ + return props.getProperty(key); + } else if(conf != null) { + // If conf is not null, and the key is not present, Configuration.get() will + // return null for us. So, we don't have to check if it contains it. + return conf.get(key); + } else { + return null; + } + } + + private OrcFile.WriterOptions getOptions(JobConf conf, Properties props) { + OrcFile.WriterOptions options = OrcFile.writerOptions(conf); + String propVal ; + if ((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.STRIPE_SIZE.getPropName(),props,conf)) != null){ + options.stripeSize(Long.parseLong(propVal)); + } + + if ((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.COMPRESSION.getPropName(),props,conf)) != null){ + options.compress(CompressionKind.valueOf(propVal)); + } + + if ((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.COMPRESSION_BLOCK_SIZE.getPropName(),props,conf)) != null){ + options.bufferSize(Integer.parseInt(propVal)); + } + + if ((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.ROW_INDEX_STRIDE.getPropName(),props,conf)) != null){ + options.rowIndexStride(Integer.parseInt(propVal)); + } + + if ((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.ENABLE_INDEXES.getPropName(),props,conf)) != null){ + if ("false".equalsIgnoreCase(propVal)) { + options.rowIndexStride(0); + } + } + + if ((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.BLOCK_PADDING.getPropName(),props,conf)) != null){ + options.blockPadding(Boolean.parseBoolean(propVal)); + } + + if ((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.ENCODING_STRATEGY.getPropName(),props,conf)) != null){ + options.encodingStrategy(OrcFile.EncodingStrategy.valueOf(propVal)); + } + + if 
((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.BLOOM_FILTER_COLUMNS.getPropName(), props, conf)) != null) { + options.bloomFilterColumns(propVal); + } + + if ((propVal = getSettingFromPropsFallingBackToConf( + OrcFile.OrcTableProperties.BLOOM_FILTER_FPP.getPropName(), props, conf)) != null) { + options.bloomFilterFpp(Double.parseDouble(propVal)); + } + + return options; + } + + @Override + public RecordWriter + getRecordWriter(FileSystem fileSystem, JobConf conf, String name, + Progressable reporter) throws IOException { + return new + OrcRecordWriter(new Path(name), getOptions(conf,null)); + } + + public StatsProvidingRecordWriter + getHiveRecordWriter(JobConf conf, + Path path, + Class valueClass, + boolean isCompressed, + Properties tableProperties, + Progressable reporter) throws IOException { + return new OrcRecordWriter(path, getOptions(conf,tableProperties)); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcSerde.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcSerde.java new file mode 100644 index 0000000000..087e8a9e9b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcSerde.java @@ -0,0 +1,156 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedSerde; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.SerDe; +import org.apache.hadoop.hive.serde2.SerDeException; +import org.apache.hadoop.hive.serde2.SerDeStats; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; +import org.apache.hadoop.io.Writable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Properties; + +/** + * A serde class for ORC. + * It transparently passes the object to/from the ORC file reader/writer. 
+ */ +public class OrcSerde implements SerDe, VectorizedSerde { + + private final OrcSerdeRow row = new OrcSerdeRow(); + private ObjectInspector inspector = null; + + private VectorizedOrcSerde vos = null; + + final class OrcSerdeRow implements Writable { + Object realRow; + ObjectInspector inspector; + + @Override + public void write(DataOutput dataOutput) throws IOException { + throw new UnsupportedOperationException("can't write the bundle"); + } + + @Override + public void readFields(DataInput dataInput) throws IOException { + throw new UnsupportedOperationException("can't read the bundle"); + } + + ObjectInspector getInspector() { + return inspector; + } + + Object getRow() { + return realRow; + } + } + + @Override + public void initialize(Configuration conf, Properties table) { + // Read the configuration parameters + String columnNameProperty = table.getProperty(serdeConstants.LIST_COLUMNS); + // NOTE: if "columns.types" is missing, all columns will be of String type + String columnTypeProperty = table.getProperty(serdeConstants.LIST_COLUMN_TYPES); + + // Parse the configuration parameters + ArrayList columnNames = new ArrayList(); + if (columnNameProperty != null && columnNameProperty.length() > 0) { + for (String name : columnNameProperty.split(",")) { + columnNames.add(name); + } + } + if (columnTypeProperty == null) { + // Default type: all string + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < columnNames.size(); i++) { + if (i > 0) { + sb.append(":"); + } + sb.append("string"); + } + columnTypeProperty = sb.toString(); + } + + ArrayList fieldTypes = + TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); + StructTypeInfo rootType = new StructTypeInfo(); + rootType.setAllStructFieldNames(columnNames); + rootType.setAllStructFieldTypeInfos(fieldTypes); + inspector = OrcStruct.createObjectInspector(rootType); + } + + @Override + public Class getSerializedClass() { + return OrcSerdeRow.class; + } + + @Override + public Writable 
serialize(Object realRow, ObjectInspector inspector) { + row.realRow = realRow; + row.inspector = inspector; + return row; + } + + @Override + public Object deserialize(Writable writable) throws SerDeException { + return writable; + } + + @Override + public ObjectInspector getObjectInspector() throws SerDeException { + return inspector; + } + + /** + * Always returns null, since serialized size doesn't make sense in the + * context of ORC files. + * + * @return null + */ + @Override + public SerDeStats getSerDeStats() { + return null; + } + + @Override + public Writable serializeVector(VectorizedRowBatch vrg, ObjectInspector objInspector) + throws SerDeException { + if (vos == null) { + vos = new VectorizedOrcSerde(getObjectInspector()); + } + return vos.serialize(vrg, getObjectInspector()); + } + + @Override + public void deserializeVector(Object rowBlob, int rowsInBatch, VectorizedRowBatch reuseBatch) + throws SerDeException { + // nothing to do here + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcStruct.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcStruct.java new file mode 100644 index 0000000000..6c1d779e86 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcStruct.java @@ -0,0 +1,607 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.typeinfo.*; +import org.apache.hadoop.io.Writable; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +final public class OrcStruct implements Writable { + + private Object[] fields; + + OrcStruct(int children) { + fields = new Object[children]; + } + + Object getFieldValue(int fieldIndex) { + return fields[fieldIndex]; + } + + void setFieldValue(int fieldIndex, Object value) { + fields[fieldIndex] = value; + } + + public int getNumFields() { + return fields.length; + } + + /** + * Change the number of fields in the struct. No effect if the number of + * fields is the same. The old field values are copied to the new array. + * @param numFields the new number of fields + */ + public void setNumFields(int numFields) { + if (fields.length != numFields) { + Object[] oldFields = fields; + fields = new Object[numFields]; + System.arraycopy(oldFields, 0, fields, 0, + Math.min(oldFields.length, numFields)); + } + } + + /** + * Destructively make this object link to other's values. 
+ * @param other the value to point to + */ + void linkFields(OrcStruct other) { + fields = other.fields; + } + + @Override + public void write(DataOutput dataOutput) throws IOException { + throw new UnsupportedOperationException("write unsupported"); + } + + @Override + public void readFields(DataInput dataInput) throws IOException { + throw new UnsupportedOperationException("readFields unsupported"); + } + + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != OrcStruct.class) { + return false; + } else { + OrcStruct oth = (OrcStruct) other; + if (fields.length != oth.fields.length) { + return false; + } + for(int i=0; i < fields.length; ++i) { + if (fields[i] == null) { + if (oth.fields[i] != null) { + return false; + } + } else { + if (!fields[i].equals(oth.fields[i])) { + return false; + } + } + } + return true; + } + } + + @Override + public int hashCode() { + int result = fields.length; + for(Object field: fields) { + if (field != null) { + result ^= field.hashCode(); + } + } + return result; + } + + @Override + public String toString() { + StringBuilder buffer = new StringBuilder(); + buffer.append("{"); + for(int i=0; i < fields.length; ++i) { + if (i != 0) { + buffer.append(", "); + } + buffer.append(fields[i]); + } + buffer.append("}"); + return buffer.toString(); + } + + static class Field implements StructField { + private final String name; + private final ObjectInspector inspector; + private final int offset; + + Field(String name, ObjectInspector inspector, int offset) { + this.name = name; + this.inspector = inspector; + this.offset = offset; + } + + @Override + public String getFieldName() { + return name; + } + + @Override + public ObjectInspector getFieldObjectInspector() { + return inspector; + } + + public int getFieldID() { + return offset; + } + + @Override + public String getFieldComment() { + return null; + } + } + + static class OrcStructInspector extends SettableStructObjectInspector { + private 
List fields; + + protected OrcStructInspector() { + super(); + } + + OrcStructInspector(List fields) { + this.fields = fields; + } + + OrcStructInspector(StructTypeInfo info) { + ArrayList fieldNames = info.getAllStructFieldNames(); + ArrayList fieldTypes = info.getAllStructFieldTypeInfos(); + fields = new ArrayList(fieldNames.size()); + for(int i=0; i < fieldNames.size(); ++i) { + fields.add(new Field(fieldNames.get(i), + createObjectInspector(fieldTypes.get(i)), i)); + } + } + + OrcStructInspector(int columnId, List types) { + OrcProto.Type type = types.get(columnId); + int fieldCount = type.getSubtypesCount(); + fields = new ArrayList(fieldCount); + for(int i=0; i < fieldCount; ++i) { + int fieldType = type.getSubtypes(i); + fields.add(new Field(type.getFieldNames(i), + createObjectInspector(fieldType, types), i)); + } + } + + @Override + public List getAllStructFieldRefs() { + return fields; + } + + @Override + public StructField getStructFieldRef(String s) { + for(StructField field: fields) { + if (field.getFieldName().equalsIgnoreCase(s)) { + return field; + } + } + return null; + } + + @Override + public Object getStructFieldData(Object object, StructField field) { + if (object == null) { + return null; + } + int offset = ((Field) field).offset; + OrcStruct struct = (OrcStruct) object; + if (offset >= struct.fields.length) { + return null; + } + + return struct.fields[offset]; + } + + @Override + public List getStructFieldsDataAsList(Object object) { + if (object == null) { + return null; + } + OrcStruct struct = (OrcStruct) object; + List result = new ArrayList(struct.fields.length); + for (Object child: struct.fields) { + result.add(child); + } + return result; + } + + @Override + public String getTypeName() { + StringBuilder buffer = new StringBuilder(); + buffer.append("struct<"); + for(int i=0; i < fields.size(); ++i) { + StructField field = fields.get(i); + if (i != 0) { + buffer.append(","); + } + buffer.append(field.getFieldName()); + 
buffer.append(":"); + buffer.append(field.getFieldObjectInspector().getTypeName()); + } + buffer.append(">"); + return buffer.toString(); + } + + @Override + public Category getCategory() { + return Category.STRUCT; + } + + @Override + public Object create() { + return new OrcStruct(0); + } + + @Override + public Object setStructFieldData(Object struct, StructField field, + Object fieldValue) { + OrcStruct orcStruct = (OrcStruct) struct; + int offset = ((Field) field).offset; + // if the offset is bigger than our current number of fields, grow it + if (orcStruct.getNumFields() <= offset) { + orcStruct.setNumFields(offset+1); + } + orcStruct.setFieldValue(offset, fieldValue); + return struct; + } + + @Override + public boolean equals(Object o) { + if (o == null || o.getClass() != getClass()) { + return false; + } else if (o == this) { + return true; + } else { + List other = ((OrcStructInspector) o).fields; + if (other.size() != fields.size()) { + return false; + } + for(int i = 0; i < fields.size(); ++i) { + StructField left = other.get(i); + StructField right = fields.get(i); + if (!(left.getFieldName().equalsIgnoreCase(right.getFieldName()) && + left.getFieldObjectInspector().equals + (right.getFieldObjectInspector()))) { + return false; + } + } + return true; + } + } + } + + static class OrcMapObjectInspector + implements MapObjectInspector, SettableMapObjectInspector { + private ObjectInspector key; + private ObjectInspector value; + + private OrcMapObjectInspector() { + super(); + } + OrcMapObjectInspector(MapTypeInfo info) { + key = createObjectInspector(info.getMapKeyTypeInfo()); + value = createObjectInspector(info.getMapValueTypeInfo()); + } + + OrcMapObjectInspector(int columnId, List types) { + OrcProto.Type type = types.get(columnId); + key = createObjectInspector(type.getSubtypes(0), types); + value = createObjectInspector(type.getSubtypes(1), types); + } + + @Override + public ObjectInspector getMapKeyObjectInspector() { + return key; + } + + 
@Override + public ObjectInspector getMapValueObjectInspector() { + return value; + } + + @Override + public Object getMapValueElement(Object map, Object key) { + return ((map == null || key == null)? null : ((Map) map).get(key)); + } + + @Override + @SuppressWarnings("unchecked") + public Map getMap(Object map) { + if (map == null) { + return null; + } + return (Map) map; + } + + @Override + public int getMapSize(Object map) { + if (map == null) { + return -1; + } + return ((Map) map).size(); + } + + @Override + public String getTypeName() { + return "map<" + key.getTypeName() + "," + value.getTypeName() + ">"; + } + + @Override + public Category getCategory() { + return Category.MAP; + } + + @Override + public Object create() { + return new HashMap(); + } + + @Override + public Object put(Object map, Object key, Object value) { + ((Map) map).put(key, value); + return map; + } + + @Override + public Object remove(Object map, Object key) { + ((Map) map).remove(key); + return map; + } + + @Override + public Object clear(Object map) { + ((Map) map).clear(); + return map; + } + + @Override + public boolean equals(Object o) { + if (o == null || o.getClass() != getClass()) { + return false; + } else if (o == this) { + return true; + } else { + OrcMapObjectInspector other = (OrcMapObjectInspector) o; + return other.key.equals(key) && other.value.equals(value); + } + } + } + + static class OrcListObjectInspector + implements ListObjectInspector, SettableListObjectInspector { + private ObjectInspector child; + + private OrcListObjectInspector() { + super(); + } + OrcListObjectInspector(ListTypeInfo info) { + child = createObjectInspector(info.getListElementTypeInfo()); + } + + OrcListObjectInspector(int columnId, List types) { + OrcProto.Type type = types.get(columnId); + child = createObjectInspector(type.getSubtypes(0), types); + } + + @Override + public ObjectInspector getListElementObjectInspector() { + return child; + } + + @Override + public Object 
getListElement(Object list, int i) { + if (list == null) { + return null; + } + return ((List) list).get(i); + } + + @Override + public int getListLength(Object list) { + if (list == null) { + return -1; + } + return ((List) list).size(); + } + + @Override + @SuppressWarnings("unchecked") + public List getList(Object list) { + if (list == null) { + return null; + } + return (List) list; + } + + @Override + public String getTypeName() { + return "array<" + child.getTypeName() + ">"; + } + + @Override + public Category getCategory() { + return Category.LIST; + } + + @Override + public Object create(int size) { + ArrayList result = new ArrayList(size); + for(int i = 0; i < size; ++i) { + result.add(null); + } + return result; + } + + @Override + public Object set(Object list, int index, Object element) { + List l = (List) list; + for(int i=l.size(); i < index+1; ++i) { + l.add(null); + } + l.set(index, element); + return list; + } + + @Override + public Object resize(Object list, int newSize) { + ((ArrayList) list).ensureCapacity(newSize); + return list; + } + + @Override + public boolean equals(Object o) { + if (o == null || o.getClass() != getClass()) { + return false; + } else if (o == this) { + return true; + } else { + ObjectInspector other = ((OrcListObjectInspector) o).child; + return other.equals(child); + } + } + } + + static public ObjectInspector createObjectInspector(TypeInfo info) { + switch (info.getCategory()) { + case PRIMITIVE: + switch (((PrimitiveTypeInfo) info).getPrimitiveCategory()) { + case FLOAT: + return PrimitiveObjectInspectorFactory.writableFloatObjectInspector; + case DOUBLE: + return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; + case BOOLEAN: + return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + case BYTE: + return PrimitiveObjectInspectorFactory.writableByteObjectInspector; + case SHORT: + return PrimitiveObjectInspectorFactory.writableShortObjectInspector; + case INT: + return 
PrimitiveObjectInspectorFactory.writableIntObjectInspector; + case LONG: + return PrimitiveObjectInspectorFactory.writableLongObjectInspector; + case BINARY: + return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; + case STRING: + return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + case CHAR: + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + (PrimitiveTypeInfo) info); + case VARCHAR: + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + (PrimitiveTypeInfo) info); + case TIMESTAMP: + return PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; + case DATE: + return PrimitiveObjectInspectorFactory.writableDateObjectInspector; + case DECIMAL: + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + (PrimitiveTypeInfo) info); + default: + throw new IllegalArgumentException("Unknown primitive type " + + ((PrimitiveTypeInfo) info).getPrimitiveCategory()); + } + case STRUCT: + return new OrcStructInspector((StructTypeInfo) info); + case UNION: + return new OrcUnion.OrcUnionObjectInspector((UnionTypeInfo) info); + case MAP: + return new OrcMapObjectInspector((MapTypeInfo) info); + case LIST: + return new OrcListObjectInspector((ListTypeInfo) info); + default: + throw new IllegalArgumentException("Unknown type " + + info.getCategory()); + } + } + + static ObjectInspector createObjectInspector(int columnId, + List types){ + OrcProto.Type type = types.get(columnId); + switch (type.getKind()) { + case FLOAT: + return PrimitiveObjectInspectorFactory.writableFloatObjectInspector; + case DOUBLE: + return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; + case BOOLEAN: + return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; + case BYTE: + return PrimitiveObjectInspectorFactory.writableByteObjectInspector; + case SHORT: + return PrimitiveObjectInspectorFactory.writableShortObjectInspector; + case INT: + return 
PrimitiveObjectInspectorFactory.writableIntObjectInspector; + case LONG: + return PrimitiveObjectInspectorFactory.writableLongObjectInspector; + case BINARY: + return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; + case STRING: + return PrimitiveObjectInspectorFactory.writableStringObjectInspector; + case CHAR: + if (!type.hasMaximumLength()) { + throw new UnsupportedOperationException( + "Illegal use of char type without length in ORC type definition."); + } + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + TypeInfoFactory.getCharTypeInfo(type.getMaximumLength())); + case VARCHAR: + if (!type.hasMaximumLength()) { + throw new UnsupportedOperationException( + "Illegal use of varchar type without length in ORC type definition."); + } + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + TypeInfoFactory.getVarcharTypeInfo(type.getMaximumLength())); + case TIMESTAMP: + return PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; + case DATE: + return PrimitiveObjectInspectorFactory.writableDateObjectInspector; + case DECIMAL: + int precision = type.hasPrecision() ? type.getPrecision() : HiveDecimal.SYSTEM_DEFAULT_PRECISION; + int scale = type.hasScale()? 
type.getScale() : HiveDecimal.SYSTEM_DEFAULT_SCALE; + return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( + TypeInfoFactory.getDecimalTypeInfo(precision, scale)); + case STRUCT: + return new OrcStructInspector(columnId, types); + case UNION: + return new OrcUnion.OrcUnionObjectInspector(columnId, types); + case MAP: + return new OrcMapObjectInspector(columnId, types); + case LIST: + return new OrcListObjectInspector(columnId, types); + default: + throw new UnsupportedOperationException("Unknown type " + + type.getKind()); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUnion.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUnion.java new file mode 100644 index 0000000000..1bc2b5d38e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUnion.java @@ -0,0 +1,160 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObject; +import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; + +import java.util.ArrayList; +import java.util.List; + +/** + * An in-memory representation of a union type. + */ +final class OrcUnion implements UnionObject { + private byte tag; + private Object object; + + void set(byte tag, Object object) { + this.tag = tag; + this.object = object; + } + + @Override + public byte getTag() { + return tag; + } + + @Override + public Object getObject() { + return object; + } + + @Override + public boolean equals(Object other) { + if (other == null || other.getClass() != OrcUnion.class) { + return false; + } + OrcUnion oth = (OrcUnion) other; + if (tag != oth.tag) { + return false; + } else if (object == null) { + return oth.object == null; + } else { + return object.equals(oth.object); + } + } + + @Override + public int hashCode() { + int result = tag; + if (object != null) { + result ^= object.hashCode(); + } + return result; + } + + @Override + public String toString() { + return "union(" + Integer.toString(tag & 0xff) + ", " + object + ")"; + } + + static class OrcUnionObjectInspector implements UnionObjectInspector { + private List children; + + protected OrcUnionObjectInspector() { + super(); + } + OrcUnionObjectInspector(int columnId, + List types) { + OrcProto.Type type = types.get(columnId); + children = new ArrayList(type.getSubtypesCount()); + for(int i=0; i < type.getSubtypesCount(); ++i) { + children.add(OrcStruct.createObjectInspector(type.getSubtypes(i), + types)); + } + } + + OrcUnionObjectInspector(UnionTypeInfo info) { + List unionChildren = info.getAllUnionObjectTypeInfos(); + this.children = new ArrayList(unionChildren.size()); 
+ for(TypeInfo child: info.getAllUnionObjectTypeInfos()) { + this.children.add(OrcStruct.createObjectInspector(child)); + } + } + + @Override + public List getObjectInspectors() { + return children; + } + + @Override + public byte getTag(Object obj) { + return ((OrcUnion) obj).tag; + } + + @Override + public Object getField(Object obj) { + return ((OrcUnion) obj).object; + } + + @Override + public String getTypeName() { + StringBuilder builder = new StringBuilder("uniontype<"); + boolean first = true; + for(ObjectInspector child: children) { + if (first) { + first = false; + } else { + builder.append(","); + } + builder.append(child.getTypeName()); + } + builder.append(">"); + return builder.toString(); + } + + @Override + public Category getCategory() { + return Category.UNION; + } + + @Override + public boolean equals(Object o) { + if (o == null || o.getClass() != getClass()) { + return false; + } else if (o == this) { + return true; + } else { + List other = ((OrcUnionObjectInspector) o).children; + if (other.size() != children.size()) { + return false; + } + for(int i = 0; i < children.size(); ++i) { + if (!other.get(i).equals(children.get(i))) { + return false; + } + } + return true; + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java new file mode 100644 index 0000000000..847c10cf69 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUtils.java @@ -0,0 +1,201 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.collect.Lists; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.serde2.objectinspector.*; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class OrcUtils { + private static final Log LOG = LogFactory.getLog(OrcUtils.class); + + /** + * Returns selected columns as a boolean array with true value set for specified column names. + * The result will contain number of elements equal to flattened number of columns. + * For example: + * selectedColumns - a,b,c + * allColumns - a,b,c,d + * If column c is a complex type, say list and other types are primitives then result will + * be [false, true, true, true, true, true, false] + * Index 0 is the root element of the struct which is set to false by default, index 1,2 + * corresponds to columns a and b. Index 3,4 correspond to column c which is list and + * index 5 correspond to column d. After flattening list gets 2 columns. 
+ * + * @param selectedColumns - comma separated list of selected column names + * @param allColumns - comma separated list of all column names + * @param inspector - object inspector + * @return - boolean array with true value set for the specified column names + */ + public static boolean[] includeColumns(String selectedColumns, String allColumns, + ObjectInspector inspector) { + int numFlattenedCols = getFlattenedColumnsCount(inspector); + boolean[] results = new boolean[numFlattenedCols]; + if ("*".equals(selectedColumns)) { + Arrays.fill(results, true); + return results; + } + if (selectedColumns != null && !selectedColumns.isEmpty()) { + includeColumnsImpl(results, selectedColumns.toLowerCase(), allColumns, inspector); + } + return results; + } + + private static void includeColumnsImpl(boolean[] includeColumns, String selectedColumns, + String allColumns, + ObjectInspector inspector) { + Map> columnSpanMap = getColumnSpan(allColumns, inspector); + LOG.info("columnSpanMap: " + columnSpanMap); + + String[] selCols = selectedColumns.split(","); + for (String sc : selCols) { + if (columnSpanMap.containsKey(sc)) { + List colSpan = columnSpanMap.get(sc); + int start = colSpan.get(0); + int end = colSpan.get(1); + for (int i = start; i <= end; i++) { + includeColumns[i] = true; + } + } + } + + LOG.info("includeColumns: " + Arrays.toString(includeColumns)); + } + + private static Map> getColumnSpan(String allColumns, + ObjectInspector inspector) { + // map that contains the column span for each column. Column span is the number of columns + // required after flattening. For a given object inspector this map contains the start column + // id and end column id (both inclusive) after flattening. 
+ // EXAMPLE: + // schema: struct> + // column span map for the above struct will be + // a => [1,1], b => [2,2], c => [3,5] + Map> columnSpanMap = new HashMap>(); + if (allColumns != null) { + String[] columns = allColumns.split(","); + int startIdx = 0; + int endIdx = 0; + if (inspector instanceof StructObjectInspector) { + StructObjectInspector soi = (StructObjectInspector) inspector; + List fields = soi.getAllStructFieldRefs(); + for (int i = 0; i < fields.size(); i++) { + StructField sf = fields.get(i); + + // we get the type (category) from object inspector but column name from the argument. + // The reason for this is hive (FileSinkOperator) does not pass the actual column names, + // instead it passes the internal column names (_col1,_col2). + ObjectInspector sfOI = sf.getFieldObjectInspector(); + String colName = columns[i]; + + startIdx = endIdx + 1; + switch (sfOI.getCategory()) { + case PRIMITIVE: + endIdx += 1; + break; + case STRUCT: + endIdx += 1; + StructObjectInspector structInsp = (StructObjectInspector) sfOI; + List structFields = structInsp.getAllStructFieldRefs(); + for (int j = 0; j < structFields.size(); ++j) { + endIdx += getFlattenedColumnsCount(structFields.get(j).getFieldObjectInspector()); + } + break; + case MAP: + endIdx += 1; + MapObjectInspector mapInsp = (MapObjectInspector) sfOI; + endIdx += getFlattenedColumnsCount(mapInsp.getMapKeyObjectInspector()); + endIdx += getFlattenedColumnsCount(mapInsp.getMapValueObjectInspector()); + break; + case LIST: + endIdx += 1; + ListObjectInspector listInsp = (ListObjectInspector) sfOI; + endIdx += getFlattenedColumnsCount(listInsp.getListElementObjectInspector()); + break; + case UNION: + endIdx += 1; + UnionObjectInspector unionInsp = (UnionObjectInspector) sfOI; + List choices = unionInsp.getObjectInspectors(); + for (int j = 0; j < choices.size(); ++j) { + endIdx += getFlattenedColumnsCount(choices.get(j)); + } + break; + default: + throw new IllegalArgumentException("Bad category: " + + 
inspector.getCategory()); + } + + columnSpanMap.put(colName, Lists.newArrayList(startIdx, endIdx)); + } + } + } + return columnSpanMap; + } + + /** + * Returns the number of columns after flatting complex types. + * + * @param inspector - object inspector + * @return + */ + public static int getFlattenedColumnsCount(ObjectInspector inspector) { + int numWriters = 0; + switch (inspector.getCategory()) { + case PRIMITIVE: + numWriters += 1; + break; + case STRUCT: + numWriters += 1; + StructObjectInspector structInsp = (StructObjectInspector) inspector; + List fields = structInsp.getAllStructFieldRefs(); + for (int i = 0; i < fields.size(); ++i) { + numWriters += getFlattenedColumnsCount(fields.get(i).getFieldObjectInspector()); + } + break; + case MAP: + numWriters += 1; + MapObjectInspector mapInsp = (MapObjectInspector) inspector; + numWriters += getFlattenedColumnsCount(mapInsp.getMapKeyObjectInspector()); + numWriters += getFlattenedColumnsCount(mapInsp.getMapValueObjectInspector()); + break; + case LIST: + numWriters += 1; + ListObjectInspector listInsp = (ListObjectInspector) inspector; + numWriters += getFlattenedColumnsCount(listInsp.getListElementObjectInspector()); + break; + case UNION: + numWriters += 1; + UnionObjectInspector unionInsp = (UnionObjectInspector) inspector; + List choices = unionInsp.getObjectInspectors(); + for (int i = 0; i < choices.size(); ++i) { + numWriters += getFlattenedColumnsCount(choices.get(i)); + } + break; + default: + throw new IllegalArgumentException("Bad category: " + + inspector.getCategory()); + } + return numWriters; + } + +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OutStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OutStream.java new file mode 100644 index 0000000000..f6cfd579b0 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OutStream.java @@ -0,0 +1,286 @@ 
+/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.IOException; +import java.nio.ByteBuffer; + +class OutStream extends PositionedOutputStream { + + interface OutputReceiver { + /** + * Output the given buffer to the final destination + * @param buffer the buffer to output + * @throws IOException + */ + void output(ByteBuffer buffer) throws IOException; + } + + static final int HEADER_SIZE = 3; + private final String name; + private final OutputReceiver receiver; + // if enabled the stream will be suppressed when writing stripe + private boolean suppress; + + /** + * Stores the uncompressed bytes that have been serialized, but not + * compressed yet. When this fills, we compress the entire buffer. + */ + private ByteBuffer current = null; + + /** + * Stores the compressed bytes until we have a full buffer and then outputs + * them to the receiver. If no compression is being done, this (and overflow) + * will always be null and the current buffer will be sent directly to the + * receiver. 
+ */ + private ByteBuffer compressed = null; + + /** + * Since the compressed buffer may start with contents from previous + * compression blocks, we allocate an overflow buffer so that the + * output of the codec can be split between the two buffers. After the + * compressed buffer is sent to the receiver, the overflow buffer becomes + * the new compressed buffer. + */ + private ByteBuffer overflow = null; + private final int bufferSize; + private final CompressionCodec codec; + private long compressedBytes = 0; + private long uncompressedBytes = 0; + + OutStream(String name, + int bufferSize, + CompressionCodec codec, + OutputReceiver receiver) throws IOException { + this.name = name; + this.bufferSize = bufferSize; + this.codec = codec; + this.receiver = receiver; + this.suppress = false; + } + + public void clear() throws IOException { + flush(); + suppress = false; + } + + /** + * Write the length of the compressed bytes. Life is much easier if the + * header is constant length, so just use 3 bytes. Considering most of the + * codecs want between 32k (snappy) and 256k (lzo, zlib), 3 bytes should + * be plenty. We also use the low bit for whether it is the original or + * compressed bytes. + * @param buffer the buffer to write the header to + * @param position the position in the buffer to write at + * @param val the size in the file + * @param original is it uncompressed + */ + private static void writeHeader(ByteBuffer buffer, + int position, + int val, + boolean original) { + buffer.put(position, (byte) ((val << 1) + (original ? 
1 : 0))); + buffer.put(position + 1, (byte) (val >> 7)); + buffer.put(position + 2, (byte) (val >> 15)); + } + + private void getNewInputBuffer() throws IOException { + if (codec == null) { + current = ByteBuffer.allocate(bufferSize); + } else { + current = ByteBuffer.allocate(bufferSize + HEADER_SIZE); + writeHeader(current, 0, bufferSize, true); + current.position(HEADER_SIZE); + } + } + + /** + * Allocate a new output buffer if we are compressing. + */ + private ByteBuffer getNewOutputBuffer() throws IOException { + return ByteBuffer.allocate(bufferSize + HEADER_SIZE); + } + + private void flip() throws IOException { + current.limit(current.position()); + current.position(codec == null ? 0 : HEADER_SIZE); + } + + @Override + public void write(int i) throws IOException { + if (current == null) { + getNewInputBuffer(); + } + if (current.remaining() < 1) { + spill(); + } + uncompressedBytes += 1; + current.put((byte) i); + } + + @Override + public void write(byte[] bytes, int offset, int length) throws IOException { + if (current == null) { + getNewInputBuffer(); + } + int remaining = Math.min(current.remaining(), length); + current.put(bytes, offset, remaining); + uncompressedBytes += remaining; + length -= remaining; + while (length != 0) { + spill(); + offset += remaining; + remaining = Math.min(current.remaining(), length); + current.put(bytes, offset, remaining); + uncompressedBytes += remaining; + length -= remaining; + } + } + + private void spill() throws IOException { + // if there isn't anything in the current buffer, don't spill + if (current == null || + current.position() == (codec == null ? 
0 : HEADER_SIZE)) { + return; + } + flip(); + if (codec == null) { + receiver.output(current); + getNewInputBuffer(); + } else { + if (compressed == null) { + compressed = getNewOutputBuffer(); + } else if (overflow == null) { + overflow = getNewOutputBuffer(); + } + int sizePosn = compressed.position(); + compressed.position(compressed.position() + HEADER_SIZE); + if (codec.compress(current, compressed, overflow)) { + uncompressedBytes = 0; + // move position back to after the header + current.position(HEADER_SIZE); + current.limit(current.capacity()); + // find the total bytes in the chunk + int totalBytes = compressed.position() - sizePosn - HEADER_SIZE; + if (overflow != null) { + totalBytes += overflow.position(); + } + compressedBytes += totalBytes + HEADER_SIZE; + writeHeader(compressed, sizePosn, totalBytes, false); + // if we have less than the next header left, spill it. + if (compressed.remaining() < HEADER_SIZE) { + compressed.flip(); + receiver.output(compressed); + compressed = overflow; + overflow = null; + } + } else { + compressedBytes += uncompressedBytes + HEADER_SIZE; + uncompressedBytes = 0; + // we are using the original, but need to spill the current + // compressed buffer first. So back up to where we started, + // flip it and add it to done. + if (sizePosn != 0) { + compressed.position(sizePosn); + compressed.flip(); + receiver.output(compressed); + compressed = null; + // if we have an overflow, clear it and make it the new compress + // buffer + if (overflow != null) { + overflow.clear(); + compressed = overflow; + overflow = null; + } + } else { + compressed.clear(); + if (overflow != null) { + overflow.clear(); + } + } + + // now add the current buffer into the done list and get a new one. 
+ current.position(0); + // update the header with the current length + writeHeader(current, 0, current.limit() - HEADER_SIZE, true); + receiver.output(current); + getNewInputBuffer(); + } + } + } + + void getPosition(PositionRecorder recorder) throws IOException { + if (codec == null) { + recorder.addPosition(uncompressedBytes); + } else { + recorder.addPosition(compressedBytes); + recorder.addPosition(uncompressedBytes); + } + } + + @Override + public void flush() throws IOException { + spill(); + if (compressed != null && compressed.position() != 0) { + compressed.flip(); + receiver.output(compressed); + compressed = null; + } + uncompressedBytes = 0; + compressedBytes = 0; + overflow = null; + current = null; + } + + @Override + public String toString() { + return name; + } + + @Override + public long getBufferSize() { + long result = 0; + if (current != null) { + result += current.capacity(); + } + if (compressed != null) { + result += compressed.capacity(); + } + if (overflow != null) { + result += overflow.capacity(); + } + return result; + } + + /** + * Set suppress flag + */ + public void suppress() { + suppress = true; + } + + /** + * Returns the state of suppress flag + * @return value of suppress flag + */ + public boolean isSuppressed() { + return suppress; + } +} + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionProvider.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionProvider.java new file mode 100644 index 0000000000..54b5ab6da1 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionProvider.java @@ -0,0 +1,26 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +/** + * An interface used for seeking to a row index. + */ +public interface PositionProvider { + long getNext(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionRecorder.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionRecorder.java new file mode 100644 index 0000000000..a39926e005 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionRecorder.java @@ -0,0 +1,25 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tajo.storage.thirdparty.orc; + +/** + * An interface for recording positions in a stream. + */ +interface PositionRecorder { + void addPosition(long offset); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionedOutputStream.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionedOutputStream.java new file mode 100644 index 0000000000..748c98cfbb --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionedOutputStream.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.IOException; +import java.io.OutputStream; + +abstract class PositionedOutputStream extends OutputStream { + + /** + * Record the current position to the recorder. + * @param recorder the object that receives the position + * @throws IOException + */ + abstract void getPosition(PositionRecorder recorder) throws IOException; + + /** + * Get the memory size currently allocated as buffer associated with this + * stream. + * @return the number of bytes used by buffers. 
+ */ + abstract long getBufferSize(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RedBlackTree.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RedBlackTree.java new file mode 100644 index 0000000000..2482f93b0b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RedBlackTree.java @@ -0,0 +1,309 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +/** + * A memory efficient red-black tree that does not allocate any objects per + * an element. This class is abstract and assumes that the child class + * handles the key and comparisons with the key. + */ +abstract class RedBlackTree { + public static final int NULL = -1; + + // Various values controlling the offset of the data within the array. 
+ private static final int LEFT_OFFSET = 0; + private static final int RIGHT_OFFSET = 1; + private static final int ELEMENT_SIZE = 2; + + protected int size = 0; + private final DynamicIntArray data; + protected int root = NULL; + protected int lastAdd = 0; + private boolean wasAdd = false; + + /** + * Create a set with the given initial capacity. + */ + public RedBlackTree(int initialCapacity) { + data = new DynamicIntArray(initialCapacity * ELEMENT_SIZE); + } + + /** + * Insert a new node into the data array, growing the array as necessary. + * + * @return Returns the position of the new node. + */ + private int insert(int left, int right, boolean isRed) { + int position = size; + size += 1; + setLeft(position, left, isRed); + setRight(position, right); + return position; + } + + /** + * Compare the value at the given position to the new value. + * @return 0 if the values are the same, -1 if the new value is smaller and + * 1 if the new value is larger. + */ + protected abstract int compareValue(int position); + + /** + * Is the given node red as opposed to black? To prevent having an extra word + * in the data array, we just the low bit on the left child index. + */ + protected boolean isRed(int position) { + return position != NULL && + (data.get(position * ELEMENT_SIZE + LEFT_OFFSET) & 1) == 1; + } + + /** + * Set the red bit true or false. + */ + private void setRed(int position, boolean isRed) { + int offset = position * ELEMENT_SIZE + LEFT_OFFSET; + if (isRed) { + data.set(offset, data.get(offset) | 1); + } else { + data.set(offset, data.get(offset) & ~1); + } + } + + /** + * Get the left field of the given position. + */ + protected int getLeft(int position) { + return data.get(position * ELEMENT_SIZE + LEFT_OFFSET) >> 1; + } + + /** + * Get the right field of the given position. + */ + protected int getRight(int position) { + return data.get(position * ELEMENT_SIZE + RIGHT_OFFSET); + } + + /** + * Set the left field of the given position. 
+ * Note that we are storing the node color in the low bit of the left pointer. + */ + private void setLeft(int position, int left) { + int offset = position * ELEMENT_SIZE + LEFT_OFFSET; + data.set(offset, (left << 1) | (data.get(offset) & 1)); + } + + /** + * Set the left field of the given position. + * Note that we are storing the node color in the low bit of the left pointer. + */ + private void setLeft(int position, int left, boolean isRed) { + int offset = position * ELEMENT_SIZE + LEFT_OFFSET; + data.set(offset, (left << 1) | (isRed ? 1 : 0)); + } + + /** + * Set the right field of the given position. + */ + private void setRight(int position, int right) { + data.set(position * ELEMENT_SIZE + RIGHT_OFFSET, right); + } + + /** + * Insert or find a given key in the tree and rebalance the tree correctly. + * Rebalancing restores the red-black aspect of the tree to maintain the + * invariants: + * 1. If a node is red, both of its children are black. + * 2. Each child of a node has the same black height (the number of black + * nodes between it and the leaves of the tree). + * + * Inserted nodes are at the leaves and are red, therefore there is at most a + * violation of rule 1 at the node we just put in. Instead of always keeping + * the parents, this routine passing down the context. + * + * The fix is broken down into 6 cases (1.{1,2,3} and 2.{1,2,3} that are + * left-right mirror images of each other). See Algorighms by Cormen, + * Leiserson, and Rivest for the explaination of the subcases. + * + * @param node The node that we are fixing right now. + * @param fromLeft Did we come down from the left? + * @param parent Nodes' parent + * @param grandparent Parent's parent + * @param greatGrandparent Grandparent's parent + * @return Does parent also need to be checked and/or fixed? 
+ */ + private boolean add(int node, boolean fromLeft, int parent, + int grandparent, int greatGrandparent) { + if (node == NULL) { + if (root == NULL) { + lastAdd = insert(NULL, NULL, false); + root = lastAdd; + wasAdd = true; + return false; + } else { + lastAdd = insert(NULL, NULL, true); + node = lastAdd; + wasAdd = true; + // connect the new node into the tree + if (fromLeft) { + setLeft(parent, node); + } else { + setRight(parent, node); + } + } + } else { + int compare = compareValue(node); + boolean keepGoing; + + // Recurse down to find where the node needs to be added + if (compare < 0) { + keepGoing = add(getLeft(node), true, node, parent, grandparent); + } else if (compare > 0) { + keepGoing = add(getRight(node), false, node, parent, grandparent); + } else { + lastAdd = node; + wasAdd = false; + return false; + } + + // we don't need to fix the root (because it is always set to black) + if (node == root || !keepGoing) { + return false; + } + } + + + // Do we need to fix this node? Only if there are two reds right under each + // other. 
+ if (isRed(node) && isRed(parent)) { + if (parent == getLeft(grandparent)) { + int uncle = getRight(grandparent); + if (isRed(uncle)) { + // case 1.1 + setRed(parent, false); + setRed(uncle, false); + setRed(grandparent, true); + return true; + } else { + if (node == getRight(parent)) { + // case 1.2 + // swap node and parent + int tmp = node; + node = parent; + parent = tmp; + // left-rotate on node + setLeft(grandparent, parent); + setRight(node, getLeft(parent)); + setLeft(parent, node); + } + + // case 1.2 and 1.3 + setRed(parent, false); + setRed(grandparent, true); + + // right-rotate on grandparent + if (greatGrandparent == NULL) { + root = parent; + } else if (getLeft(greatGrandparent) == grandparent) { + setLeft(greatGrandparent, parent); + } else { + setRight(greatGrandparent, parent); + } + setLeft(grandparent, getRight(parent)); + setRight(parent, grandparent); + return false; + } + } else { + int uncle = getLeft(grandparent); + if (isRed(uncle)) { + // case 2.1 + setRed(parent, false); + setRed(uncle, false); + setRed(grandparent, true); + return true; + } else { + if (node == getLeft(parent)) { + // case 2.2 + // swap node and parent + int tmp = node; + node = parent; + parent = tmp; + // right-rotate on node + setRight(grandparent, parent); + setLeft(node, getRight(parent)); + setRight(parent, node); + } + // case 2.2 and 2.3 + setRed(parent, false); + setRed(grandparent, true); + // left-rotate on grandparent + if (greatGrandparent == NULL) { + root = parent; + } else if (getRight(greatGrandparent) == grandparent) { + setRight(greatGrandparent, parent); + } else { + setLeft(greatGrandparent, parent); + } + setRight(grandparent, getLeft(parent)); + setLeft(parent, grandparent); + return false; + } + } + } else { + return true; + } + } + + /** + * Add the new key to the tree. + * @return true if the element is a new one. 
+ */ + protected boolean add() { + add(root, false, NULL, NULL, NULL); + if (wasAdd) { + setRed(root, false); + return true; + } else { + return false; + } + } + + /** + * Get the number of elements in the set. + */ + public int size() { + return size; + } + + /** + * Reset the table to empty. + */ + public void clear() { + root = NULL; + size = 0; + data.clear(); + } + + /** + * Get the buffer size in bytes. + */ + public long getSizeInBytes() { + return data.getSizeInBytes(); + } +} + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthByteWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthByteWriter.java new file mode 100644 index 0000000000..0953cdd2a1 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthByteWriter.java @@ -0,0 +1,106 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.IOException; + +/** + * A streamFactory that writes a sequence of bytes. A control byte is written before + * each run with positive values 0 to 127 meaning 2 to 129 repetitions. 
If the + * bytes is -1 to -128, 1 to 128 literal byte values follow. + */ +class RunLengthByteWriter { + static final int MIN_REPEAT_SIZE = 3; + static final int MAX_LITERAL_SIZE = 128; + static final int MAX_REPEAT_SIZE= 127 + MIN_REPEAT_SIZE; + private final PositionedOutputStream output; + private final byte[] literals = new byte[MAX_LITERAL_SIZE]; + private int numLiterals = 0; + private boolean repeat = false; + private int tailRunLength = 0; + + RunLengthByteWriter(PositionedOutputStream output) { + this.output = output; + } + + private void writeValues() throws IOException { + if (numLiterals != 0) { + if (repeat) { + output.write(numLiterals - MIN_REPEAT_SIZE); + output.write(literals, 0, 1); + } else { + output.write(-numLiterals); + output.write(literals, 0, numLiterals); + } + repeat = false; + tailRunLength = 0; + numLiterals = 0; + } + } + + void flush() throws IOException { + writeValues(); + output.flush(); + } + + void write(byte value) throws IOException { + if (numLiterals == 0) { + literals[numLiterals++] = value; + tailRunLength = 1; + } else if (repeat) { + if (value == literals[0]) { + numLiterals += 1; + if (numLiterals == MAX_REPEAT_SIZE) { + writeValues(); + } + } else { + writeValues(); + literals[numLiterals++] = value; + tailRunLength = 1; + } + } else { + if (value == literals[numLiterals - 1]) { + tailRunLength += 1; + } else { + tailRunLength = 1; + } + if (tailRunLength == MIN_REPEAT_SIZE) { + if (numLiterals + 1 == MIN_REPEAT_SIZE) { + repeat = true; + numLiterals += 1; + } else { + numLiterals -= MIN_REPEAT_SIZE - 1; + writeValues(); + literals[0] = value; + repeat = true; + numLiterals = MIN_REPEAT_SIZE; + } + } else { + literals[numLiterals++] = value; + if (numLiterals == MAX_LITERAL_SIZE) { + writeValues(); + } + } + } + } + + void getPosition(PositionRecorder recorder) throws IOException { + output.getPosition(recorder); + recorder.addPosition(numLiterals); + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriter.java new file mode 100644 index 0000000000..867f041912 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriter.java @@ -0,0 +1,143 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.IOException; + +/** + * A streamFactory that writes a sequence of integers. A control byte is written before + * each run with positive values 0 to 127 meaning 3 to 130 repetitions, each + * repetition is offset by a delta. If the control byte is -1 to -128, 1 to 128 + * literal vint values follow. 
 */
class RunLengthIntegerWriter implements IntegerWriter {
  // A run must repeat at least this many times before it is worth encoding
  // as a (count, delta, base) triple instead of literals.
  static final int MIN_REPEAT_SIZE = 3;
  // Deltas are stored in a single signed byte, so runs are only detected
  // when consecutive differences fit in [-128, 127].
  static final int MAX_DELTA = 127;
  static final int MIN_DELTA = -128;
  // At most 128 pending literals are buffered before being flushed;
  // literal runs are written with a negative count byte (-numLiterals).
  static final int MAX_LITERAL_SIZE = 128;
  // Repeat runs store (count - MIN_REPEAT_SIZE) in one byte, so the longest
  // encodable repeat is 127 + MIN_REPEAT_SIZE values.
  private static final int MAX_REPEAT_SIZE = 127 + MIN_REPEAT_SIZE;
  private final PositionedOutputStream output;
  // When true, values are written as signed varints; otherwise unsigned.
  private final boolean signed;
  // Buffer of values not yet written to the output stream.
  private final long[] literals = new long[MAX_LITERAL_SIZE];
  private int numLiterals = 0;
  // Delta of the current (candidate) run; meaningful only while repeat == true
  // or while tailRunLength >= 2.
  private long delta = 0;
  // True once MIN_REPEAT_SIZE equally-spaced values have been buffered.
  private boolean repeat = false;
  // Length of the equally-spaced run at the tail of the literal buffer.
  private int tailRunLength = 0;
  private SerializationUtils utils;

  /**
   * Creates a run-length writer (ORC RLE version 1) on top of the given
   * stream.
   *
   * @param output destination for the encoded bytes
   * @param signed whether values are serialized with zigzag (signed) varints
   */
  RunLengthIntegerWriter(PositionedOutputStream output,
                         boolean signed) {
    this.output = output;
    this.signed = signed;
    this.utils = new SerializationUtils();
  }

  /**
   * Flushes the buffered values as either a repeat run
   * (control byte = count - MIN_REPEAT_SIZE, then the delta byte, then the
   * base value as a varint) or a literal run (control byte = -numLiterals,
   * then each value as a varint), and resets the run state.
   */
  private void writeValues() throws IOException {
    if (numLiterals != 0) {
      if (repeat) {
        output.write(numLiterals - MIN_REPEAT_SIZE);
        output.write((byte) delta);
        if (signed) {
          utils.writeVslong(output, literals[0]);
        } else {
          utils.writeVulong(output, literals[0]);
        }
      } else {
        // Negative count marks a literal run.
        output.write(-numLiterals);
        for(int i=0; i < numLiterals; ++i) {
          if (signed) {
            utils.writeVslong(output, literals[i]);
          } else {
            utils.writeVulong(output, literals[i]);
          }
        }
      }
      repeat = false;
      numLiterals = 0;
      tailRunLength = 0;
    }
  }

  /** Writes any buffered values and flushes the underlying stream. */
  @Override
  public void flush() throws IOException {
    writeValues();
    output.flush();
  }

  /**
   * Buffers one value, emitting a run whenever the repeat-run or
   * literal-buffer limits are reached.
   *
   * @param value the next integer in the sequence
   */
  @Override
  public void write(long value) throws IOException {
    if (numLiterals == 0) {
      literals[numLiterals++] = value;
      tailRunLength = 1;
    } else if (repeat) {
      // Already inside a repeat run: extend it while the value continues
      // the arithmetic progression, otherwise flush and restart.
      if (value == literals[0] + delta * numLiterals) {
        numLiterals += 1;
        if (numLiterals == MAX_REPEAT_SIZE) {
          writeValues();
        }
      } else {
        writeValues();
        literals[numLiterals++] = value;
        tailRunLength = 1;
      }
    } else {
      // Track how long the equally-spaced tail of the literal buffer is.
      if (tailRunLength == 1) {
        delta = value - literals[numLiterals - 1];
        if (delta < MIN_DELTA || delta > MAX_DELTA) {
          tailRunLength = 1;
        } else {
          tailRunLength = 2;
        }
      } else if (value == literals[numLiterals - 1] + delta) {
        tailRunLength += 1;
      } else {
        delta = value - literals[numLiterals - 1];
        if (delta < MIN_DELTA || delta > MAX_DELTA) {
          tailRunLength = 1;
        } else {
          tailRunLength = 2;
        }
      }
      if (tailRunLength == MIN_REPEAT_SIZE) {
        if (numLiterals + 1 == MIN_REPEAT_SIZE) {
          // The whole buffer is the run: switch to repeat mode in place.
          repeat = true;
          numLiterals += 1;
        } else {
          // Flush the literals that precede the run, then seed the buffer
          // with the run's base value and continue in repeat mode.
          numLiterals -= MIN_REPEAT_SIZE - 1;
          long base = literals[numLiterals];
          writeValues();
          literals[0] = base;
          repeat = true;
          numLiterals = MIN_REPEAT_SIZE;
        }
      } else {
        literals[numLiterals++] = value;
        if (numLiterals == MAX_LITERAL_SIZE) {
          writeValues();
        }
      }
    }
  }

  /**
   * Records the stream position plus the number of still-buffered literals,
   * so a reader can seek to an exact value.
   */
  @Override
  public void getPosition(PositionRecorder recorder) throws IOException {
    output.getPosition(recorder);
    recorder.addPosition(numLiterals);
  }

}
diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriterV2.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriterV2.java
new file mode 100644
index 0000000000..7237b2e29d
--- /dev/null
+++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/RunLengthIntegerWriterV2.java
@@ -0,0 +1,832 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.IOException; + +/** + * A writer that performs light weight compression over sequence of integers. + *

 * <p>There are four types of lightweight integer compression:</p>
 * <ul>
 *   <li>SHORT_REPEAT</li>
 *   <li>DIRECT</li>
 *   <li>PATCHED_BASE</li>
 *   <li>DELTA</li>
 * </ul>
 *
 * <p>The description and format for these types are as below:</p>
 *
 * <p><b>SHORT_REPEAT:</b> Used for short repeated integer sequences.</p>
 * <ul>
 *   <li>1 byte header
 *     <ul>
 *       <li>2 bits for encoding type</li>
 *       <li>3 bits for bytes required for repeating value</li>
 *       <li>3 bits for repeat count (MIN_REPEAT + run length)</li>
 *     </ul>
 *   </li>
 *   <li>Blob - repeat value (fixed bytes)</li>
 * </ul>
 *
 * <p><b>DIRECT:</b> Used for random integer sequences whose number of bit
 * requirement doesn't vary a lot.</p>
 * <ul>
 *   <li>2 bytes header
 *     <ul>
 *       <li>1st byte
 *         <ul>
 *           <li>2 bits for encoding type</li>
 *           <li>5 bits for fixed bit width of values in blob</li>
 *           <li>1 bit for storing MSB of run length</li>
 *         </ul>
 *       </li>
 *       <li>2nd byte
 *         <ul>
 *           <li>8 bits for lower run length bits</li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 *   <li>Blob - stores the direct values using fixed bit width. The length of
 *       the data blob is (fixed width * run length) bits long</li>
 * </ul>
 *
 * <p><b>PATCHED_BASE:</b> Used for random integer sequences whose number of
 * bit requirement varies beyond a threshold.</p>
 * <ul>
 *   <li>4 bytes header
 *     <ul>
 *       <li>1st byte
 *         <ul>
 *           <li>2 bits for encoding type</li>
 *           <li>5 bits for fixed bit width of values in blob</li>
 *           <li>1 bit for storing MSB of run length</li>
 *         </ul>
 *       </li>
 *       <li>2nd byte
 *         <ul>
 *           <li>8 bits for lower run length bits</li>
 *         </ul>
 *       </li>
 *       <li>3rd byte
 *         <ul>
 *           <li>3 bits for bytes required to encode base value</li>
 *           <li>5 bits for patch width</li>
 *         </ul>
 *       </li>
 *       <li>4th byte
 *         <ul>
 *           <li>3 bits for patch gap width</li>
 *           <li>5 bits for patch length</li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 *   <li>Base value - Stored using fixed number of bytes. If MSB is set, base
 *       value is negative else positive. Length of base value is
 *       (base width * 8) bits.</li>
 *   <li>Data blob - Base reduced values as stored using fixed bit width.
 *       Length of data blob is (fixed width * run length) bits.</li>
 *   <li>Patch blob - Patch blob is a list of gap and patch value. Each entry
 *       in the patch list is (patch width + patch gap width) bits long. Gap
 *       between the subsequent elements to be patched are stored in upper
 *       part of entry whereas patch values are stored in lower part of entry.
 *       Length of patch blob is
 *       ((patch width + patch gap width) * patch length) bits.</li>
 * </ul>
 *
 * <p><b>DELTA:</b> Used for monotonically increasing or decreasing sequences,
 * sequences with fixed delta values or long repeated sequences.</p>
 * <ul>
 *   <li>2 bytes header
 *     <ul>
 *       <li>1st byte
 *         <ul>
 *           <li>2 bits for encoding type</li>
 *           <li>5 bits for fixed bit width of values in blob</li>
 *           <li>1 bit for storing MSB of run length</li>
 *         </ul>
 *       </li>
 *       <li>2nd byte
 *         <ul>
 *           <li>8 bits for lower run length bits</li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 *   <li>Base value - encoded as varint</li>
 *   <li>Delta base - encoded as varint</li>
 *   <li>Delta blob - only positive values. monotonicity and orderness are
 *       decided based on the sign of the base value and delta base</li>
 * </ul>

+ */ +class RunLengthIntegerWriterV2 implements IntegerWriter { + + public enum EncodingType { + SHORT_REPEAT, DIRECT, PATCHED_BASE, DELTA + } + + static final int MAX_SCOPE = 512; + static final int MIN_REPEAT = 3; + private static final int MAX_SHORT_REPEAT_LENGTH = 10; + private long prevDelta = 0; + private int fixedRunLength = 0; + private int variableRunLength = 0; + private final long[] literals = new long[MAX_SCOPE]; + private final PositionedOutputStream output; + private final boolean signed; + private EncodingType encoding; + private int numLiterals; + private final long[] zigzagLiterals = new long[MAX_SCOPE]; + private final long[] baseRedLiterals = new long[MAX_SCOPE]; + private final long[] adjDeltas = new long[MAX_SCOPE]; + private long fixedDelta; + private int zzBits90p; + private int zzBits100p; + private int brBits95p; + private int brBits100p; + private int bitsDeltaMax; + private int patchWidth; + private int patchGapWidth; + private int patchLength; + private long[] gapVsPatchList; + private long min; + private boolean isFixedDelta; + private SerializationUtils utils; + private boolean alignedBitpacking; + + RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed) { + this(output, signed, true); + } + + RunLengthIntegerWriterV2(PositionedOutputStream output, boolean signed, + boolean alignedBitpacking) { + this.output = output; + this.signed = signed; + this.alignedBitpacking = alignedBitpacking; + this.utils = new SerializationUtils(); + clear(); + } + + private void writeValues() throws IOException { + if (numLiterals != 0) { + + if (encoding.equals(EncodingType.SHORT_REPEAT)) { + writeShortRepeatValues(); + } else if (encoding.equals(EncodingType.DIRECT)) { + writeDirectValues(); + } else if (encoding.equals(EncodingType.PATCHED_BASE)) { + writePatchedBaseValues(); + } else { + writeDeltaValues(); + } + + // clear all the variables + clear(); + } + } + + private void writeDeltaValues() throws IOException { + int len = 0; + 
int fb = bitsDeltaMax; + int efb = 0; + + if (alignedBitpacking) { + fb = utils.getClosestAlignedFixedBits(fb); + } + + if (isFixedDelta) { + // if fixed run length is greater than threshold then it will be fixed + // delta sequence with delta value 0 else fixed delta sequence with + // non-zero delta value + if (fixedRunLength > MIN_REPEAT) { + // ex. sequence: 2 2 2 2 2 2 2 2 + len = fixedRunLength - 1; + fixedRunLength = 0; + } else { + // ex. sequence: 4 6 8 10 12 14 16 + len = variableRunLength - 1; + variableRunLength = 0; + } + } else { + // fixed width 0 is used for long repeating values. + // sequences that require only 1 bit to encode will have an additional bit + if (fb == 1) { + fb = 2; + } + efb = utils.encodeBitWidth(fb); + efb = efb << 1; + len = variableRunLength - 1; + variableRunLength = 0; + } + + // extract the 9th bit of run length + final int tailBits = (len & 0x100) >>> 8; + + // create first byte of the header + final int headerFirstByte = getOpcode() | efb | tailBits; + + // second byte of the header stores the remaining 8 bits of runlength + final int headerSecondByte = len & 0xff; + + // write header + output.write(headerFirstByte); + output.write(headerSecondByte); + + // store the first value from zigzag literal array + if (signed) { + utils.writeVslong(output, literals[0]); + } else { + utils.writeVulong(output, literals[0]); + } + + if (isFixedDelta) { + // if delta is fixed then we don't need to store delta blob + utils.writeVslong(output, fixedDelta); + } else { + // store the first value as delta value using zigzag encoding + utils.writeVslong(output, adjDeltas[0]); + + // adjacent delta values are bit packed. The length of adjDeltas array is + // always one less than the number of literals (delta difference for n + // elements is n-1). 
We have already written one element, write the + // remaining numLiterals - 2 elements here + utils.writeInts(adjDeltas, 1, numLiterals - 2, fb, output); + } + } + + private void writePatchedBaseValues() throws IOException { + + // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding + // because patch is applied to MSB bits. For example: If fixed bit width of + // base value is 7 bits and if patch is 3 bits, the actual value is + // constructed by shifting the patch to left by 7 positions. + // actual_value = patch << 7 | base_value + // So, if we align base_value then actual_value can not be reconstructed. + + // write the number of fixed bits required in next 5 bits + final int fb = brBits95p; + final int efb = utils.encodeBitWidth(fb) << 1; + + // adjust variable run length, they are one off + variableRunLength -= 1; + + // extract the 9th bit of run length + final int tailBits = (variableRunLength & 0x100) >>> 8; + + // create first byte of the header + final int headerFirstByte = getOpcode() | efb | tailBits; + + // second byte of the header stores the remaining 8 bits of runlength + final int headerSecondByte = variableRunLength & 0xff; + + // if the min value is negative toggle the sign + final boolean isNegative = min < 0 ? true : false; + if (isNegative) { + min = -min; + } + + // find the number of bytes required for base and shift it by 5 bits + // to accommodate patch width. The additional bit is used to store the sign + // of the base value. + final int baseWidth = utils.findClosestNumBits(min) + 1; + final int baseBytes = baseWidth % 8 == 0 ? 
baseWidth / 8 : (baseWidth / 8) + 1; + final int bb = (baseBytes - 1) << 5; + + // if the base value is negative then set MSB to 1 + if (isNegative) { + min |= (1L << ((baseBytes * 8) - 1)); + } + + // third byte contains 3 bits for number of bytes occupied by base + // and 5 bits for patchWidth + final int headerThirdByte = bb | utils.encodeBitWidth(patchWidth); + + // fourth byte contains 3 bits for page gap width and 5 bits for + // patch length + final int headerFourthByte = (patchGapWidth - 1) << 5 | patchLength; + + // write header + output.write(headerFirstByte); + output.write(headerSecondByte); + output.write(headerThirdByte); + output.write(headerFourthByte); + + // write the base value using fixed bytes in big endian order + for(int i = baseBytes - 1; i >= 0; i--) { + byte b = (byte) ((min >>> (i * 8)) & 0xff); + output.write(b); + } + + // base reduced literals are bit packed + int closestFixedBits = utils.getClosestFixedBits(fb); + + utils.writeInts(baseRedLiterals, 0, numLiterals, closestFixedBits, + output); + + // write patch list + closestFixedBits = utils.getClosestFixedBits(patchGapWidth + patchWidth); + + utils.writeInts(gapVsPatchList, 0, gapVsPatchList.length, closestFixedBits, + output); + + // reset run length + variableRunLength = 0; + } + + /** + * Store the opcode in 2 MSB bits + * @return opcode + */ + private int getOpcode() { + return encoding.ordinal() << 6; + } + + private void writeDirectValues() throws IOException { + + // write the number of fixed bits required in next 5 bits + int fb = zzBits100p; + + if (alignedBitpacking) { + fb = utils.getClosestAlignedFixedBits(fb); + } + + final int efb = utils.encodeBitWidth(fb) << 1; + + // adjust variable run length + variableRunLength -= 1; + + // extract the 9th bit of run length + final int tailBits = (variableRunLength & 0x100) >>> 8; + + // create first byte of the header + final int headerFirstByte = getOpcode() | efb | tailBits; + + // second byte of the header stores the remaining 
8 bits of runlength + final int headerSecondByte = variableRunLength & 0xff; + + // write header + output.write(headerFirstByte); + output.write(headerSecondByte); + + // bit packing the zigzag encoded literals + utils.writeInts(zigzagLiterals, 0, numLiterals, fb, output); + + // reset run length + variableRunLength = 0; + } + + private void writeShortRepeatValues() throws IOException { + // get the value that is repeating, compute the bits and bytes required + long repeatVal = 0; + if (signed) { + repeatVal = utils.zigzagEncode(literals[0]); + } else { + repeatVal = literals[0]; + } + + final int numBitsRepeatVal = utils.findClosestNumBits(repeatVal); + final int numBytesRepeatVal = numBitsRepeatVal % 8 == 0 ? numBitsRepeatVal >>> 3 + : (numBitsRepeatVal >>> 3) + 1; + + // write encoding type in top 2 bits + int header = getOpcode(); + + // write the number of bytes required for the value + header |= ((numBytesRepeatVal - 1) << 3); + + // write the run length + fixedRunLength -= MIN_REPEAT; + header |= fixedRunLength; + + // write the header + output.write(header); + + // write the repeating value in big endian byte order + for(int i = numBytesRepeatVal - 1; i >= 0; i--) { + int b = (int) ((repeatVal >>> (i * 8)) & 0xff); + output.write(b); + } + + fixedRunLength = 0; + } + + private void determineEncoding() { + + // we need to compute zigzag values for DIRECT encoding if we decide to + // break early for delta overflows or for shorter runs + computeZigZagLiterals(); + + zzBits100p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 1.0); + + // not a big win for shorter runs to determine encoding + if (numLiterals <= MIN_REPEAT) { + encoding = EncodingType.DIRECT; + return; + } + + // DELTA encoding check + + // for identifying monotonic sequences + boolean isIncreasing = true; + boolean isDecreasing = true; + this.isFixedDelta = true; + + this.min = literals[0]; + long max = literals[0]; + final long initialDelta = literals[1] - literals[0]; + long currDelta 
= initialDelta; + long deltaMax = initialDelta; + this.adjDeltas[0] = initialDelta; + + for (int i = 1; i < numLiterals; i++) { + final long l1 = literals[i]; + final long l0 = literals[i - 1]; + currDelta = l1 - l0; + min = Math.min(min, l1); + max = Math.max(max, l1); + + isIncreasing &= (l0 <= l1); + isDecreasing &= (l0 >= l1); + + isFixedDelta &= (currDelta == initialDelta); + if (i > 1) { + adjDeltas[i - 1] = Math.abs(currDelta); + deltaMax = Math.max(deltaMax, adjDeltas[i - 1]); + } + } + + // its faster to exit under delta overflow condition without checking for + // PATCHED_BASE condition as encoding using DIRECT is faster and has less + // overhead than PATCHED_BASE + if (!utils.isSafeSubtract(max, min)) { + encoding = EncodingType.DIRECT; + return; + } + + // invariant - subtracting any number from any other in the literals after + // this point won't overflow + + // if initialDelta is 0 then we cannot delta encode as we cannot identify + // the sign of deltas (increasing or decreasing) + if (initialDelta != 0) { + + // if min is equal to max then the delta is 0, this condition happens for + // fixed values run >10 which cannot be encoded with SHORT_REPEAT + if (min == max) { + assert isFixedDelta : min + "==" + max + + ", isFixedDelta cannot be false"; + assert currDelta == 0 : min + "==" + max + ", currDelta should be zero"; + fixedDelta = 0; + encoding = EncodingType.DELTA; + return; + } + + if (isFixedDelta) { + assert currDelta == initialDelta + : "currDelta should be equal to initialDelta for fixed delta encoding"; + encoding = EncodingType.DELTA; + fixedDelta = currDelta; + return; + } + + // stores the number of bits required for packing delta blob in + // delta encoding + bitsDeltaMax = utils.findClosestNumBits(deltaMax); + + // monotonic condition + if (isIncreasing || isDecreasing) { + encoding = EncodingType.DELTA; + return; + } + } + + // PATCHED_BASE encoding check + + // percentile values are computed for the zigzag encoded values. 
if the + // number of bit requirement between 90th and 100th percentile varies + // beyond a threshold then we need to patch the values. if the variation + // is not significant then we can use direct encoding + + zzBits90p = utils.percentileBits(zigzagLiterals, 0, numLiterals, 0.9); + int diffBitsLH = zzBits100p - zzBits90p; + + // if the difference between 90th percentile and 100th percentile fixed + // bits is > 1 then we need patch the values + if (diffBitsLH > 1) { + + // patching is done only on base reduced values. + // remove base from literals + for (int i = 0; i < numLiterals; i++) { + baseRedLiterals[i] = literals[i] - min; + } + + // 95th percentile width is used to determine max allowed value + // after which patching will be done + brBits95p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 0.95); + + // 100th percentile is used to compute the max patch width + brBits100p = utils.percentileBits(baseRedLiterals, 0, numLiterals, 1.0); + + // after base reducing the values, if the difference in bits between + // 95th percentile and 100th percentile value is zero then there + // is no point in patching the values, in which case we will + // fallback to DIRECT encoding. + // The decision to use patched base was based on zigzag values, but the + // actual patching is done on base reduced literals. + if ((brBits100p - brBits95p) != 0) { + encoding = EncodingType.PATCHED_BASE; + preparePatchedBlob(); + return; + } else { + encoding = EncodingType.DIRECT; + return; + } + } else { + // if difference in bits between 95th percentile and 100th percentile is + // 0, then patch length will become 0. 
Hence we will fallback to direct + encoding = EncodingType.DIRECT; + return; + } + } + + private void computeZigZagLiterals() { + // populate zigzag encoded literals + long zzEncVal = 0; + for (int i = 0; i < numLiterals; i++) { + if (signed) { + zzEncVal = utils.zigzagEncode(literals[i]); + } else { + zzEncVal = literals[i]; + } + zigzagLiterals[i] = zzEncVal; + } + } + + private void preparePatchedBlob() { + // mask will be max value beyond which patch will be generated + long mask = (1L << brBits95p) - 1; + + // since we are considering only 95 percentile, the size of gap and + // patch array can contain only be 5% values + patchLength = (int) Math.ceil((numLiterals * 0.05)); + + int[] gapList = new int[patchLength]; + long[] patchList = new long[patchLength]; + + // #bit for patch + patchWidth = brBits100p - brBits95p; + patchWidth = utils.getClosestFixedBits(patchWidth); + + // if patch bit requirement is 64 then it will not possible to pack + // gap and patch together in a long. To make sure gap and patch can be + // packed together adjust the patch width + if (patchWidth == 64) { + patchWidth = 56; + brBits95p = 8; + mask = (1L << brBits95p) - 1; + } + + int gapIdx = 0; + int patchIdx = 0; + int prev = 0; + int gap = 0; + int maxGap = 0; + + for(int i = 0; i < numLiterals; i++) { + // if value is above mask then create the patch and record the gap + if (baseRedLiterals[i] > mask) { + gap = i - prev; + if (gap > maxGap) { + maxGap = gap; + } + + // gaps are relative, so store the previous patched value index + prev = i; + gapList[gapIdx++] = gap; + + // extract the most significant bits that are over mask bits + long patch = baseRedLiterals[i] >>> brBits95p; + patchList[patchIdx++] = patch; + + // strip off the MSB to enable safe bit packing + baseRedLiterals[i] &= mask; + } + } + + // adjust the patch length to number of entries in gap list + patchLength = gapIdx; + + // if the element to be patched is the first and only element then + // max gap will be 0, 
but to store the gap as 0 we need atleast 1 bit + if (maxGap == 0 && patchLength != 0) { + patchGapWidth = 1; + } else { + patchGapWidth = utils.findClosestNumBits(maxGap); + } + + // special case: if the patch gap width is greater than 256, then + // we need 9 bits to encode the gap width. But we only have 3 bits in + // header to record the gap width. To deal with this case, we will save + // two entries in patch list in the following way + // 256 gap width => 0 for patch value + // actual gap - 256 => actual patch value + // We will do the same for gap width = 511. If the element to be patched is + // the last element in the scope then gap width will be 511. In this case we + // will have 3 entries in the patch list in the following way + // 255 gap width => 0 for patch value + // 255 gap width => 0 for patch value + // 1 gap width => actual patch value + if (patchGapWidth > 8) { + patchGapWidth = 8; + // for gap = 511, we need two additional entries in patch list + if (maxGap == 511) { + patchLength += 2; + } else { + patchLength += 1; + } + } + + // create gap vs patch list + gapIdx = 0; + patchIdx = 0; + gapVsPatchList = new long[patchLength]; + for(int i = 0; i < patchLength; i++) { + long g = gapList[gapIdx++]; + long p = patchList[patchIdx++]; + while (g > 255) { + gapVsPatchList[i++] = (255L << patchWidth); + g -= 255; + } + + // store patch value in LSBs and gap in MSBs + gapVsPatchList[i] = (g << patchWidth) | p; + } + } + + /** + * clears all the variables + */ + private void clear() { + numLiterals = 0; + encoding = null; + prevDelta = 0; + fixedDelta = 0; + zzBits90p = 0; + zzBits100p = 0; + brBits95p = 0; + brBits100p = 0; + bitsDeltaMax = 0; + patchGapWidth = 0; + patchLength = 0; + patchWidth = 0; + gapVsPatchList = null; + min = 0; + isFixedDelta = true; + } + + @Override + public void flush() throws IOException { + if (numLiterals != 0) { + if (variableRunLength != 0) { + determineEncoding(); + writeValues(); + } else if (fixedRunLength != 0) { 
+ if (fixedRunLength < MIN_REPEAT) { + variableRunLength = fixedRunLength; + fixedRunLength = 0; + determineEncoding(); + writeValues(); + } else if (fixedRunLength >= MIN_REPEAT + && fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + encoding = EncodingType.SHORT_REPEAT; + writeValues(); + } else { + encoding = EncodingType.DELTA; + isFixedDelta = true; + writeValues(); + } + } + } + output.flush(); + } + + @Override + public void write(long val) throws IOException { + if (numLiterals == 0) { + initializeLiterals(val); + } else { + if (numLiterals == 1) { + prevDelta = val - literals[0]; + literals[numLiterals++] = val; + // if both values are same count as fixed run else variable run + if (val == literals[0]) { + fixedRunLength = 2; + variableRunLength = 0; + } else { + fixedRunLength = 0; + variableRunLength = 2; + } + } else { + long currentDelta = val - literals[numLiterals - 1]; + if (prevDelta == 0 && currentDelta == 0) { + // fixed delta run + + literals[numLiterals++] = val; + + // if variable run is non-zero then we are seeing repeating + // values at the end of variable run in which case keep + // updating variable and fixed runs + if (variableRunLength > 0) { + fixedRunLength = 2; + } + fixedRunLength += 1; + + // if fixed run met the minimum condition and if variable + // run is non-zero then flush the variable run and shift the + // tail fixed runs to start of the buffer + if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) { + numLiterals -= MIN_REPEAT; + variableRunLength -= MIN_REPEAT - 1; + // copy the tail fixed runs + long[] tailVals = new long[MIN_REPEAT]; + System.arraycopy(literals, numLiterals, tailVals, 0, MIN_REPEAT); + + // determine variable encoding and flush values + determineEncoding(); + writeValues(); + + // shift tail fixed runs to beginning of the buffer + for(long l : tailVals) { + literals[numLiterals++] = l; + } + } + + // if fixed runs reached max repeat length then write values + if (fixedRunLength == MAX_SCOPE) { + 
determineEncoding(); + writeValues(); + } + } else { + // variable delta run + + // if fixed run length is non-zero and if it satisfies the + // short repeat conditions then write the values as short repeats + // else use delta encoding + if (fixedRunLength >= MIN_REPEAT) { + if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) { + encoding = EncodingType.SHORT_REPEAT; + writeValues(); + } else { + encoding = EncodingType.DELTA; + isFixedDelta = true; + writeValues(); + } + } + + // if fixed run length is 0 && fixedRunLength < MIN_REPEAT) { + if (val != literals[numLiterals - 1]) { + variableRunLength = fixedRunLength; + fixedRunLength = 0; + } + } + + // after writing values re-initialize the variables + if (numLiterals == 0) { + initializeLiterals(val); + } else { + // keep updating variable run lengths + prevDelta = val - literals[numLiterals - 1]; + literals[numLiterals++] = val; + variableRunLength += 1; + + // if variable run length reach the max scope, write it + if (variableRunLength == MAX_SCOPE) { + determineEncoding(); + writeValues(); + } + } + } + } + } + } + + private void initializeLiterals(long val) { + literals[numLiterals++] = val; + fixedRunLength = 1; + variableRunLength = 1; + } + + @Override + public void getPosition(PositionRecorder recorder) throws IOException { + output.getPosition(recorder); + recorder.addPosition(numLiterals); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java new file mode 100644 index 0000000000..53687b7fdb --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SerializationUtils.java @@ -0,0 +1,844 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.EOFException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.math.BigInteger; + +final class SerializationUtils { + + private final static int BUFFER_SIZE = 64; + private final byte[] readBuffer; + private final byte[] writeBuffer; + + public SerializationUtils() { + this.readBuffer = new byte[BUFFER_SIZE]; + this.writeBuffer = new byte[BUFFER_SIZE]; + } + + void writeVulong(OutputStream output, long value) throws IOException { + while (true) { + if ((value & ~0x7f) == 0) { + output.write((byte) value); + return; + } else { + output.write((byte) (0x80 | (value & 0x7f))); + value >>>= 7; + } + } + } + + void writeVslong(OutputStream output, long value) throws IOException { + writeVulong(output, (value << 1) ^ (value >> 63)); + } + + + long readVulong(InputStream in) throws IOException { + long result = 0; + long b; + int offset = 0; + do { + b = in.read(); + if (b == -1) { + throw new EOFException("Reading Vulong past EOF"); + } + result |= (0x7f & b) << offset; + offset += 7; + } while (b >= 0x80); + return result; + } + + long readVslong(InputStream in) throws IOException { + long result = readVulong(in); + return (result >>> 1) ^ -(result & 1); + } + 
+ float readFloat(InputStream in) throws IOException { + int ser = in.read() | (in.read() << 8) | (in.read() << 16) | + (in.read() << 24); + return Float.intBitsToFloat(ser); + } + + void writeFloat(OutputStream output, float value) throws IOException { + int ser = Float.floatToIntBits(value); + output.write(ser & 0xff); + output.write((ser >> 8) & 0xff); + output.write((ser >> 16) & 0xff); + output.write((ser >> 24) & 0xff); + } + + double readDouble(InputStream in) throws IOException { + return Double.longBitsToDouble(readLongLE(in)); + } + + long readLongLE(InputStream in) throws IOException { + in.read(readBuffer, 0, 8); + return (((readBuffer[0] & 0xff) << 0) + + ((readBuffer[1] & 0xff) << 8) + + ((readBuffer[2] & 0xff) << 16) + + ((long) (readBuffer[3] & 0xff) << 24) + + ((long) (readBuffer[4] & 0xff) << 32) + + ((long) (readBuffer[5] & 0xff) << 40) + + ((long) (readBuffer[6] & 0xff) << 48) + + ((long) (readBuffer[7] & 0xff) << 56)); + } + + void writeDouble(OutputStream output, double value) throws IOException { + writeLongLE(output, Double.doubleToLongBits(value)); + } + + private void writeLongLE(OutputStream output, long value) throws IOException { + writeBuffer[0] = (byte) ((value >> 0) & 0xff); + writeBuffer[1] = (byte) ((value >> 8) & 0xff); + writeBuffer[2] = (byte) ((value >> 16) & 0xff); + writeBuffer[3] = (byte) ((value >> 24) & 0xff); + writeBuffer[4] = (byte) ((value >> 32) & 0xff); + writeBuffer[5] = (byte) ((value >> 40) & 0xff); + writeBuffer[6] = (byte) ((value >> 48) & 0xff); + writeBuffer[7] = (byte) ((value >> 56) & 0xff); + output.write(writeBuffer, 0, 8); + } + + /** + * Write the arbitrarily sized signed BigInteger in vint format. + * + * Signed integers are encoded using the low bit as the sign bit using zigzag + * encoding. + * + * Each byte uses the low 7 bits for data and the high bit for stop/continue. + * + * Bytes are stored LSB first. 
+ * @param output the stream to write to + * @param value the value to output + * @throws IOException + */ + static void writeBigInteger(OutputStream output, + BigInteger value) throws IOException { + // encode the signed number as a positive integer + value = value.shiftLeft(1); + int sign = value.signum(); + if (sign < 0) { + value = value.negate(); + value = value.subtract(BigInteger.ONE); + } + int length = value.bitLength(); + while (true) { + long lowBits = value.longValue() & 0x7fffffffffffffffL; + length -= 63; + // write out the next 63 bits worth of data + for(int i=0; i < 9; ++i) { + // if this is the last byte, leave the high bit off + if (length <= 0 && (lowBits & ~0x7f) == 0) { + output.write((byte) lowBits); + return; + } else { + output.write((byte) (0x80 | (lowBits & 0x7f))); + lowBits >>>= 7; + } + } + value = value.shiftRight(63); + } + } + + /** + * Read the signed arbitrary sized BigInteger BigInteger in vint format + * @param input the stream to read from + * @return the read BigInteger + * @throws IOException + */ + static BigInteger readBigInteger(InputStream input) throws IOException { + BigInteger result = BigInteger.ZERO; + long work = 0; + int offset = 0; + long b; + do { + b = input.read(); + if (b == -1) { + throw new EOFException("Reading BigInteger past EOF from " + input); + } + work |= (0x7f & b) << (offset % 63); + offset += 7; + // if we've read 63 bits, roll them into the result + if (offset == 63) { + result = BigInteger.valueOf(work); + work = 0; + } else if (offset % 63 == 0) { + result = result.or(BigInteger.valueOf(work).shiftLeft(offset-63)); + work = 0; + } + } while (b >= 0x80); + if (work != 0) { + result = result.or(BigInteger.valueOf(work).shiftLeft((offset/63)*63)); + } + // convert back to a signed number + boolean isNegative = result.testBit(0); + if (isNegative) { + result = result.add(BigInteger.ONE); + result = result.negate(); + } + result = result.shiftRight(1); + return result; + } + + enum FixedBitSizes { + 
ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE, TEN, ELEVEN, TWELVE, + THIRTEEN, FOURTEEN, FIFTEEN, SIXTEEN, SEVENTEEN, EIGHTEEN, NINETEEN, + TWENTY, TWENTYONE, TWENTYTWO, TWENTYTHREE, TWENTYFOUR, TWENTYSIX, + TWENTYEIGHT, THIRTY, THIRTYTWO, FORTY, FORTYEIGHT, FIFTYSIX, SIXTYFOUR; + } + + /** + * Count the number of bits required to encode the given value + * @param value + * @return bits required to store value + */ + int findClosestNumBits(long value) { + int count = 0; + while (value != 0) { + count++; + value = value >>> 1; + } + return getClosestFixedBits(count); + } + + /** + * zigzag encode the given value + * @param val + * @return zigzag encoded value + */ + long zigzagEncode(long val) { + return (val << 1) ^ (val >> 63); + } + + /** + * zigzag decode the given value + * @param val + * @return zizag decoded value + */ + long zigzagDecode(long val) { + return (val >>> 1) ^ -(val & 1); + } + + /** + * Compute the bits required to represent pth percentile value + * @param data - array + * @param p - percentile value (>=0.0 to <=1.0) + * @return pth percentile bits + */ + int percentileBits(long[] data, int offset, int length, double p) { + if ((p > 1.0) || (p <= 0.0)) { + return -1; + } + + // histogram that store the encoded bit requirement for each values. 
+ // maximum number of bits that can encoded is 32 (refer FixedBitSizes) + int[] hist = new int[32]; + + // compute the histogram + for(int i = offset; i < (offset + length); i++) { + int idx = encodeBitWidth(findClosestNumBits(data[i])); + hist[idx] += 1; + } + + int perLen = (int) (length * (1.0 - p)); + + // return the bits required by pth percentile length + for(int i = hist.length - 1; i >= 0; i--) { + perLen -= hist[i]; + if (perLen < 0) { + return decodeBitWidth(i); + } + } + + return 0; + } + + /** + * Calculate the number of bytes required + * @param n - number of values + * @param numBits - bit width + * @return number of bytes required + */ + int getTotalBytesRequired(int n, int numBits) { + return (n * numBits + 7) / 8; + } + + /** + * For a given fixed bit this function will return the closest available fixed + * bit + * @param n + * @return closest valid fixed bit + */ + int getClosestFixedBits(int n) { + if (n == 0) { + return 1; + } + + if (n >= 1 && n <= 24) { + return n; + } else if (n > 24 && n <= 26) { + return 26; + } else if (n > 26 && n <= 28) { + return 28; + } else if (n > 28 && n <= 30) { + return 30; + } else if (n > 30 && n <= 32) { + return 32; + } else if (n > 32 && n <= 40) { + return 40; + } else if (n > 40 && n <= 48) { + return 48; + } else if (n > 48 && n <= 56) { + return 56; + } else { + return 64; + } + } + + public int getClosestAlignedFixedBits(int n) { + if (n == 0 || n == 1) { + return 1; + } else if (n > 1 && n <= 2) { + return 2; + } else if (n > 2 && n <= 4) { + return 4; + } else if (n > 4 && n <= 8) { + return 8; + } else if (n > 8 && n <= 16) { + return 16; + } else if (n > 16 && n <= 24) { + return 24; + } else if (n > 24 && n <= 32) { + return 32; + } else if (n > 32 && n <= 40) { + return 40; + } else if (n > 40 && n <= 48) { + return 48; + } else if (n > 48 && n <= 56) { + return 56; + } else { + return 64; + } + } + + /** + * Finds the closest available fixed bit width match and returns its encoded + * value 
(ordinal) + * @param n - fixed bit width to encode + * @return encoded fixed bit width + */ + int encodeBitWidth(int n) { + n = getClosestFixedBits(n); + + if (n >= 1 && n <= 24) { + return n - 1; + } else if (n > 24 && n <= 26) { + return FixedBitSizes.TWENTYSIX.ordinal(); + } else if (n > 26 && n <= 28) { + return FixedBitSizes.TWENTYEIGHT.ordinal(); + } else if (n > 28 && n <= 30) { + return FixedBitSizes.THIRTY.ordinal(); + } else if (n > 30 && n <= 32) { + return FixedBitSizes.THIRTYTWO.ordinal(); + } else if (n > 32 && n <= 40) { + return FixedBitSizes.FORTY.ordinal(); + } else if (n > 40 && n <= 48) { + return FixedBitSizes.FORTYEIGHT.ordinal(); + } else if (n > 48 && n <= 56) { + return FixedBitSizes.FIFTYSIX.ordinal(); + } else { + return FixedBitSizes.SIXTYFOUR.ordinal(); + } + } + + /** + * Decodes the ordinal fixed bit value to actual fixed bit width value + * @param n - encoded fixed bit width + * @return decoded fixed bit width + */ + int decodeBitWidth(int n) { + if (n >= FixedBitSizes.ONE.ordinal() + && n <= FixedBitSizes.TWENTYFOUR.ordinal()) { + return n + 1; + } else if (n == FixedBitSizes.TWENTYSIX.ordinal()) { + return 26; + } else if (n == FixedBitSizes.TWENTYEIGHT.ordinal()) { + return 28; + } else if (n == FixedBitSizes.THIRTY.ordinal()) { + return 30; + } else if (n == FixedBitSizes.THIRTYTWO.ordinal()) { + return 32; + } else if (n == FixedBitSizes.FORTY.ordinal()) { + return 40; + } else if (n == FixedBitSizes.FORTYEIGHT.ordinal()) { + return 48; + } else if (n == FixedBitSizes.FIFTYSIX.ordinal()) { + return 56; + } else { + return 64; + } + } + + /** + * Bitpack and write the input values to underlying output stream + * @param input - values to write + * @param offset - offset + * @param len - length + * @param bitSize - bit width + * @param output - output stream + * @throws IOException + */ + void writeInts(long[] input, int offset, int len, int bitSize, + OutputStream output) throws IOException { + if (input == null || input.length < 
1 || offset < 0 || len < 1 + || bitSize < 1) { + return; + } + + switch (bitSize) { + case 1: + unrolledBitPack1(input, offset, len, output); + return; + case 2: + unrolledBitPack2(input, offset, len, output); + return; + case 4: + unrolledBitPack4(input, offset, len, output); + return; + case 8: + unrolledBitPack8(input, offset, len, output); + return; + case 16: + unrolledBitPack16(input, offset, len, output); + return; + case 24: + unrolledBitPack24(input, offset, len, output); + return; + case 32: + unrolledBitPack32(input, offset, len, output); + return; + case 40: + unrolledBitPack40(input, offset, len, output); + return; + case 48: + unrolledBitPack48(input, offset, len, output); + return; + case 56: + unrolledBitPack56(input, offset, len, output); + return; + case 64: + unrolledBitPack64(input, offset, len, output); + return; + default: + break; + } + + int bitsLeft = 8; + byte current = 0; + for(int i = offset; i < (offset + len); i++) { + long value = input[i]; + int bitsToWrite = bitSize; + while (bitsToWrite > bitsLeft) { + // add the bits to the bottom of the current word + current |= value >>> (bitsToWrite - bitsLeft); + // subtract out the bits we just added + bitsToWrite -= bitsLeft; + // zero out the bits above bitsToWrite + value &= (1L << bitsToWrite) - 1; + output.write(current); + current = 0; + bitsLeft = 8; + } + bitsLeft -= bitsToWrite; + current |= value << bitsLeft; + if (bitsLeft == 0) { + output.write(current); + current = 0; + bitsLeft = 8; + } + } + + // flush + if (bitsLeft != 8) { + output.write(current); + current = 0; + bitsLeft = 8; + } + } + + private void unrolledBitPack1(long[] input, int offset, int len, + OutputStream output) throws IOException { + final int numHops = 8; + final int remainder = len % numHops; + final int endOffset = offset + len; + final int endUnroll = endOffset - remainder; + int val = 0; + for (int i = offset; i < endUnroll; i = i + numHops) { + val = (int) (val | ((input[i] & 1) << 7) + | ((input[i + 1] & 
1) << 6) + | ((input[i + 2] & 1) << 5) + | ((input[i + 3] & 1) << 4) + | ((input[i + 4] & 1) << 3) + | ((input[i + 5] & 1) << 2) + | ((input[i + 6] & 1) << 1) + | (input[i + 7]) & 1); + output.write(val); + val = 0; + } + + if (remainder > 0) { + int startShift = 7; + for (int i = endUnroll; i < endOffset; i++) { + val = (int) (val | (input[i] & 1) << startShift); + startShift -= 1; + } + output.write(val); + } + } + + private void unrolledBitPack2(long[] input, int offset, int len, + OutputStream output) throws IOException { + final int numHops = 4; + final int remainder = len % numHops; + final int endOffset = offset + len; + final int endUnroll = endOffset - remainder; + int val = 0; + for (int i = offset; i < endUnroll; i = i + numHops) { + val = (int) (val | ((input[i] & 3) << 6) + | ((input[i + 1] & 3) << 4) + | ((input[i + 2] & 3) << 2) + | (input[i + 3]) & 3); + output.write(val); + val = 0; + } + + if (remainder > 0) { + int startShift = 6; + for (int i = endUnroll; i < endOffset; i++) { + val = (int) (val | (input[i] & 3) << startShift); + startShift -= 2; + } + output.write(val); + } + } + + private void unrolledBitPack4(long[] input, int offset, int len, + OutputStream output) throws IOException { + final int numHops = 2; + final int remainder = len % numHops; + final int endOffset = offset + len; + final int endUnroll = endOffset - remainder; + int val = 0; + for (int i = offset; i < endUnroll; i = i + numHops) { + val = (int) (val | ((input[i] & 15) << 4) | (input[i + 1]) & 15); + output.write(val); + val = 0; + } + + if (remainder > 0) { + int startShift = 4; + for (int i = endUnroll; i < endOffset; i++) { + val = (int) (val | (input[i] & 15) << startShift); + startShift -= 4; + } + output.write(val); + } + } + + private void unrolledBitPack8(long[] input, int offset, int len, + OutputStream output) throws IOException { + unrolledBitPackBytes(input, offset, len, output, 1); + } + + private void unrolledBitPack16(long[] input, int offset, int len, + 
OutputStream output) throws IOException { + unrolledBitPackBytes(input, offset, len, output, 2); + } + + private void unrolledBitPack24(long[] input, int offset, int len, + OutputStream output) throws IOException { + unrolledBitPackBytes(input, offset, len, output, 3); + } + + private void unrolledBitPack32(long[] input, int offset, int len, + OutputStream output) throws IOException { + unrolledBitPackBytes(input, offset, len, output, 4); + } + + private void unrolledBitPack40(long[] input, int offset, int len, + OutputStream output) throws IOException { + unrolledBitPackBytes(input, offset, len, output, 5); + } + + private void unrolledBitPack48(long[] input, int offset, int len, + OutputStream output) throws IOException { + unrolledBitPackBytes(input, offset, len, output, 6); + } + + private void unrolledBitPack56(long[] input, int offset, int len, + OutputStream output) throws IOException { + unrolledBitPackBytes(input, offset, len, output, 7); + } + + private void unrolledBitPack64(long[] input, int offset, int len, + OutputStream output) throws IOException { + unrolledBitPackBytes(input, offset, len, output, 8); + } + + private void unrolledBitPackBytes(long[] input, int offset, int len, OutputStream output, int numBytes) throws IOException { + final int numHops = 8; + final int remainder = len % numHops; + final int endOffset = offset + len; + final int endUnroll = endOffset - remainder; + int i = offset; + for (; i < endUnroll; i = i + numHops) { + writeLongBE(output, input, i, numHops, numBytes); + } + + if (remainder > 0) { + writeRemainingLongs(output, i, input, remainder, numBytes); + } + } + + private void writeRemainingLongs(OutputStream output, int offset, long[] input, int remainder, + int numBytes) throws IOException { + final int numHops = remainder; + + int idx = 0; + switch (numBytes) { + case 1: + while (remainder > 0) { + writeBuffer[idx] = (byte) (input[offset + idx] & 255); + remainder--; + idx++; + } + break; + case 2: + while (remainder > 
0) { + writeLongBE2(output, input[offset + idx], idx * 2); + remainder--; + idx++; + } + break; + case 3: + while (remainder > 0) { + writeLongBE3(output, input[offset + idx], idx * 3); + remainder--; + idx++; + } + break; + case 4: + while (remainder > 0) { + writeLongBE4(output, input[offset + idx], idx * 4); + remainder--; + idx++; + } + break; + case 5: + while (remainder > 0) { + writeLongBE5(output, input[offset + idx], idx * 5); + remainder--; + idx++; + } + break; + case 6: + while (remainder > 0) { + writeLongBE6(output, input[offset + idx], idx * 6); + remainder--; + idx++; + } + break; + case 7: + while (remainder > 0) { + writeLongBE7(output, input[offset + idx], idx * 7); + remainder--; + idx++; + } + break; + case 8: + while (remainder > 0) { + writeLongBE8(output, input[offset + idx], idx * 8); + remainder--; + idx++; + } + break; + default: + break; + } + + final int toWrite = numHops * numBytes; + output.write(writeBuffer, 0, toWrite); + } + + private void writeLongBE(OutputStream output, long[] input, int offset, int numHops, int numBytes) throws IOException { + + switch (numBytes) { + case 1: + writeBuffer[0] = (byte) (input[offset + 0] & 255); + writeBuffer[1] = (byte) (input[offset + 1] & 255); + writeBuffer[2] = (byte) (input[offset + 2] & 255); + writeBuffer[3] = (byte) (input[offset + 3] & 255); + writeBuffer[4] = (byte) (input[offset + 4] & 255); + writeBuffer[5] = (byte) (input[offset + 5] & 255); + writeBuffer[6] = (byte) (input[offset + 6] & 255); + writeBuffer[7] = (byte) (input[offset + 7] & 255); + break; + case 2: + writeLongBE2(output, input[offset + 0], 0); + writeLongBE2(output, input[offset + 1], 2); + writeLongBE2(output, input[offset + 2], 4); + writeLongBE2(output, input[offset + 3], 6); + writeLongBE2(output, input[offset + 4], 8); + writeLongBE2(output, input[offset + 5], 10); + writeLongBE2(output, input[offset + 6], 12); + writeLongBE2(output, input[offset + 7], 14); + break; + case 3: + writeLongBE3(output, input[offset + 
0], 0); + writeLongBE3(output, input[offset + 1], 3); + writeLongBE3(output, input[offset + 2], 6); + writeLongBE3(output, input[offset + 3], 9); + writeLongBE3(output, input[offset + 4], 12); + writeLongBE3(output, input[offset + 5], 15); + writeLongBE3(output, input[offset + 6], 18); + writeLongBE3(output, input[offset + 7], 21); + break; + case 4: + writeLongBE4(output, input[offset + 0], 0); + writeLongBE4(output, input[offset + 1], 4); + writeLongBE4(output, input[offset + 2], 8); + writeLongBE4(output, input[offset + 3], 12); + writeLongBE4(output, input[offset + 4], 16); + writeLongBE4(output, input[offset + 5], 20); + writeLongBE4(output, input[offset + 6], 24); + writeLongBE4(output, input[offset + 7], 28); + break; + case 5: + writeLongBE5(output, input[offset + 0], 0); + writeLongBE5(output, input[offset + 1], 5); + writeLongBE5(output, input[offset + 2], 10); + writeLongBE5(output, input[offset + 3], 15); + writeLongBE5(output, input[offset + 4], 20); + writeLongBE5(output, input[offset + 5], 25); + writeLongBE5(output, input[offset + 6], 30); + writeLongBE5(output, input[offset + 7], 35); + break; + case 6: + writeLongBE6(output, input[offset + 0], 0); + writeLongBE6(output, input[offset + 1], 6); + writeLongBE6(output, input[offset + 2], 12); + writeLongBE6(output, input[offset + 3], 18); + writeLongBE6(output, input[offset + 4], 24); + writeLongBE6(output, input[offset + 5], 30); + writeLongBE6(output, input[offset + 6], 36); + writeLongBE6(output, input[offset + 7], 42); + break; + case 7: + writeLongBE7(output, input[offset + 0], 0); + writeLongBE7(output, input[offset + 1], 7); + writeLongBE7(output, input[offset + 2], 14); + writeLongBE7(output, input[offset + 3], 21); + writeLongBE7(output, input[offset + 4], 28); + writeLongBE7(output, input[offset + 5], 35); + writeLongBE7(output, input[offset + 6], 42); + writeLongBE7(output, input[offset + 7], 49); + break; + case 8: + writeLongBE8(output, input[offset + 0], 0); + writeLongBE8(output, 
input[offset + 1], 8); + writeLongBE8(output, input[offset + 2], 16); + writeLongBE8(output, input[offset + 3], 24); + writeLongBE8(output, input[offset + 4], 32); + writeLongBE8(output, input[offset + 5], 40); + writeLongBE8(output, input[offset + 6], 48); + writeLongBE8(output, input[offset + 7], 56); + break; + default: + break; + } + + final int toWrite = numHops * numBytes; + output.write(writeBuffer, 0, toWrite); + } + + private void writeLongBE2(OutputStream output, long val, int wbOffset) { + writeBuffer[wbOffset + 0] = (byte) (val >>> 8); + writeBuffer[wbOffset + 1] = (byte) (val >>> 0); + } + + private void writeLongBE3(OutputStream output, long val, int wbOffset) { + writeBuffer[wbOffset + 0] = (byte) (val >>> 16); + writeBuffer[wbOffset + 1] = (byte) (val >>> 8); + writeBuffer[wbOffset + 2] = (byte) (val >>> 0); + } + + private void writeLongBE4(OutputStream output, long val, int wbOffset) { + writeBuffer[wbOffset + 0] = (byte) (val >>> 24); + writeBuffer[wbOffset + 1] = (byte) (val >>> 16); + writeBuffer[wbOffset + 2] = (byte) (val >>> 8); + writeBuffer[wbOffset + 3] = (byte) (val >>> 0); + } + + private void writeLongBE5(OutputStream output, long val, int wbOffset) { + writeBuffer[wbOffset + 0] = (byte) (val >>> 32); + writeBuffer[wbOffset + 1] = (byte) (val >>> 24); + writeBuffer[wbOffset + 2] = (byte) (val >>> 16); + writeBuffer[wbOffset + 3] = (byte) (val >>> 8); + writeBuffer[wbOffset + 4] = (byte) (val >>> 0); + } + + private void writeLongBE6(OutputStream output, long val, int wbOffset) { + writeBuffer[wbOffset + 0] = (byte) (val >>> 40); + writeBuffer[wbOffset + 1] = (byte) (val >>> 32); + writeBuffer[wbOffset + 2] = (byte) (val >>> 24); + writeBuffer[wbOffset + 3] = (byte) (val >>> 16); + writeBuffer[wbOffset + 4] = (byte) (val >>> 8); + writeBuffer[wbOffset + 5] = (byte) (val >>> 0); + } + + private void writeLongBE7(OutputStream output, long val, int wbOffset) { + writeBuffer[wbOffset + 0] = (byte) (val >>> 48); + writeBuffer[wbOffset + 1] = 
(byte) (val >>> 40); + writeBuffer[wbOffset + 2] = (byte) (val >>> 32); + writeBuffer[wbOffset + 3] = (byte) (val >>> 24); + writeBuffer[wbOffset + 4] = (byte) (val >>> 16); + writeBuffer[wbOffset + 5] = (byte) (val >>> 8); + writeBuffer[wbOffset + 6] = (byte) (val >>> 0); + } + + private void writeLongBE8(OutputStream output, long val, int wbOffset) { + writeBuffer[wbOffset + 0] = (byte) (val >>> 56); + writeBuffer[wbOffset + 1] = (byte) (val >>> 48); + writeBuffer[wbOffset + 2] = (byte) (val >>> 40); + writeBuffer[wbOffset + 3] = (byte) (val >>> 32); + writeBuffer[wbOffset + 4] = (byte) (val >>> 24); + writeBuffer[wbOffset + 5] = (byte) (val >>> 16); + writeBuffer[wbOffset + 6] = (byte) (val >>> 8); + writeBuffer[wbOffset + 7] = (byte) (val >>> 0); + } + + // Do not want to use Guava LongMath.checkedSubtract() here as it will throw + // ArithmeticException in case of overflow + public boolean isSafeSubtract(long left, long right) { + return (left ^ right) >= 0 | (left ^ (left - right)) >= 0; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java new file mode 100644 index 0000000000..285a32aeb8 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/SnappyCodec.java @@ -0,0 +1,109 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.hive.shims.HadoopShims.DirectCompressionType; +import org.apache.hadoop.hive.shims.HadoopShims.DirectDecompressorShim; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.iq80.snappy.Snappy; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.EnumSet; + +class SnappyCodec implements CompressionCodec, DirectDecompressionCodec { + + Boolean direct = null; + + @Override + public boolean compress(ByteBuffer in, ByteBuffer out, + ByteBuffer overflow) throws IOException { + int inBytes = in.remaining(); + // I should work on a patch for Snappy to support an overflow buffer + // to prevent the extra buffer copy. 
+ byte[] compressed = new byte[Snappy.maxCompressedLength(inBytes)]; + int outBytes = + Snappy.compress(in.array(), in.arrayOffset() + in.position(), inBytes, + compressed, 0); + if (outBytes < inBytes) { + int remaining = out.remaining(); + if (remaining >= outBytes) { + System.arraycopy(compressed, 0, out.array(), out.arrayOffset() + + out.position(), outBytes); + out.position(out.position() + outBytes); + } else { + System.arraycopy(compressed, 0, out.array(), out.arrayOffset() + + out.position(), remaining); + out.position(out.limit()); + System.arraycopy(compressed, remaining, overflow.array(), + overflow.arrayOffset(), outBytes - remaining); + overflow.position(outBytes - remaining); + } + return true; + } else { + return false; + } + } + + @Override + public void decompress(ByteBuffer in, ByteBuffer out) throws IOException { + if(in.isDirect() && out.isDirect()) { + directDecompress(in, out); + return; + } + int inOffset = in.position(); + int uncompressLen = + Snappy.uncompress(in.array(), in.arrayOffset() + inOffset, + in.limit() - inOffset, out.array(), out.arrayOffset() + out.position()); + out.position(uncompressLen + out.position()); + out.flip(); + } + + @Override + public boolean isAvailable() { + if (direct == null) { + try { + if (ShimLoader.getHadoopShims().getDirectDecompressor( + DirectCompressionType.SNAPPY) != null) { + direct = Boolean.valueOf(true); + } else { + direct = Boolean.valueOf(false); + } + } catch (UnsatisfiedLinkError ule) { + direct = Boolean.valueOf(false); + } + } + return direct.booleanValue(); + } + + @Override + public void directDecompress(ByteBuffer in, ByteBuffer out) + throws IOException { + DirectDecompressorShim decompressShim = ShimLoader.getHadoopShims() + .getDirectDecompressor(DirectCompressionType.SNAPPY); + decompressShim.decompress(in, out); + out.flip(); // flip for read + } + + @Override + public CompressionCodec modify(EnumSet modifiers) { + // snappy allows no modifications + return this; + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java new file mode 100644 index 0000000000..382164530c --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StreamName.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +/** + * The name of a stream within a stripe. 
+ */ +class StreamName implements Comparable { + private final int column; + private final OrcProto.Stream.Kind kind; + + public enum Area { + DATA, INDEX + } + + public StreamName(int column, OrcProto.Stream.Kind kind) { + this.column = column; + this.kind = kind; + } + + public boolean equals(Object obj) { + if (obj != null && obj instanceof StreamName) { + StreamName other = (StreamName) obj; + return other.column == column && other.kind == kind; + } else { + return false; + } + } + + @Override + public int compareTo(StreamName streamName) { + if (streamName == null) { + return -1; + } + Area area = getArea(kind); + Area otherArea = StreamName.getArea(streamName.kind); + if (area != otherArea) { + return -area.compareTo(otherArea); + } + if (column != streamName.column) { + return column < streamName.column ? -1 : 1; + } + return kind.compareTo(streamName.kind); + } + + public int getColumn() { + return column; + } + + public OrcProto.Stream.Kind getKind() { + return kind; + } + + public Area getArea() { + return getArea(kind); + } + + public static Area getArea(OrcProto.Stream.Kind kind) { + switch (kind) { + case ROW_INDEX: + case DICTIONARY_COUNT: + case BLOOM_FILTER: + return Area.INDEX; + default: + return Area.DATA; + } + } + + @Override + public String toString() { + return "Stream for column " + column + " kind " + kind; + } + + @Override + public int hashCode() { + return column * 101 + kind.getNumber(); + } +} + diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java new file mode 100644 index 0000000000..42486646bf --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringColumnStatistics.java @@ -0,0 +1,41 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +/** + * Statistics for string columns. + */ +public interface StringColumnStatistics extends ColumnStatistics { + /** + * Get the minimum string. + * @return the minimum + */ + String getMinimum(); + + /** + * Get the maximum string. + * @return the maximum + */ + String getMaximum(); + + /** + * Get the total length of all strings + * @return the sum (total length) + */ + long getSum(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java new file mode 100644 index 0000000000..7c698d14a5 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java @@ -0,0 +1,202 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.io.Text; + +import java.io.IOException; +import java.io.OutputStream; + +/** + * A red-black tree that stores strings. The strings are stored as UTF-8 bytes + * and an offset for each entry. + */ +class StringRedBlackTree extends RedBlackTree { + private final DynamicByteArray byteArray = new DynamicByteArray(); + private final DynamicIntArray keyOffsets; + private final Text newKey = new Text(); + + public StringRedBlackTree(int initialCapacity) { + super(initialCapacity); + keyOffsets = new DynamicIntArray(initialCapacity); + } + + public int add(String value) { + newKey.set(value); + return addNewKey(); + } + + private int addNewKey() { + // if the newKey is actually new, add it to our byteArray and store the offset & length + if (add()) { + int len = newKey.getLength(); + keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len)); + } + return lastAdd; + } + + public int add(Text value) { + newKey.set(value); + return addNewKey(); + } + + @Override + protected int compareValue(int position) { + int start = keyOffsets.get(position); + int end; + if (position + 1 == keyOffsets.size()) { + end = byteArray.size(); + } else { + end = keyOffsets.get(position+1); + } + return byteArray.compare(newKey.getBytes(), 0, newKey.getLength(), + start, end - start); + } + + /** + * The information about each node. + */ + public interface VisitorContext { + /** + * Get the position where the key was originally added. + * @return the number returned by add. 
+ */ + int getOriginalPosition(); + + /** + * Write the bytes for the string to the given output stream. + * @param out the stream to write to. + * @throws IOException + */ + void writeBytes(OutputStream out) throws IOException; + + /** + * Get the original string. + * @return the string + */ + Text getText(); + + /** + * Get the number of bytes. + * @return the string's length in bytes + */ + int getLength(); + } + + /** + * The interface for visitors. + */ + public interface Visitor { + /** + * Called once for each node of the tree in sort order. + * @param context the information about each node + * @throws IOException + */ + void visit(VisitorContext context) throws IOException; + } + + private class VisitorContextImpl implements VisitorContext { + private int originalPosition; + private int start; + private int end; + private final Text text = new Text(); + + public int getOriginalPosition() { + return originalPosition; + } + + public Text getText() { + byteArray.setText(text, start, end - start); + return text; + } + + public void writeBytes(OutputStream out) throws IOException { + byteArray.write(out, start, end - start); + } + + public int getLength() { + return end - start; + } + + void setPosition(int position) { + originalPosition = position; + start = keyOffsets.get(originalPosition); + if (position + 1 == keyOffsets.size()) { + end = byteArray.size(); + } else { + end = keyOffsets.get(originalPosition + 1); + } + } + } + + private void recurse(int node, Visitor visitor, VisitorContextImpl context + ) throws IOException { + if (node != NULL) { + recurse(getLeft(node), visitor, context); + context.setPosition(node); + visitor.visit(context); + recurse(getRight(node), visitor, context); + } + } + + /** + * Visit all of the nodes in the tree in sorted order. 
+ * @param visitor the action to be applied to each node + * @throws IOException + */ + public void visit(Visitor visitor) throws IOException { + recurse(root, visitor, new VisitorContextImpl()); + } + + /** + * Reset the table to empty. + */ + public void clear() { + super.clear(); + byteArray.clear(); + keyOffsets.clear(); + } + + public void getText(Text result, int originalPosition) { + int offset = keyOffsets.get(originalPosition); + int length; + if (originalPosition + 1 == keyOffsets.size()) { + length = byteArray.size() - offset; + } else { + length = keyOffsets.get(originalPosition + 1) - offset; + } + byteArray.setText(result, offset, length); + } + + /** + * Get the size of the character data in the table. + * @return the bytes used by the table + */ + public int getCharacterSize() { + return byteArray.size(); + } + + /** + * Calculate the approximate size in memory. + * @return the number of bytes used in storing the tree. + */ + public long getSizeInBytes() { + return byteArray.getSizeInBytes() + keyOffsets.getSizeInBytes() + + super.getSizeInBytes(); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java new file mode 100644 index 0000000000..62819c1a22 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeInformation.java @@ -0,0 +1,59 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +/** + * Information about the stripes in an ORC file that is provided by the Reader. + */ +public interface StripeInformation { + /** + * Get the byte offset of the start of the stripe. + * @return the bytes from the start of the file + */ + long getOffset(); + + /** + * Get the total length of the stripe in bytes. + * @return the number of bytes in the stripe + */ + long getLength(); + + /** + * Get the length of the stripe's indexes. + * @return the number of bytes in the index + */ + long getIndexLength(); + + /** + * Get the length of the stripe's data. + * @return the number of bytes in the stripe + */ + long getDataLength(); + + /** + * Get the length of the stripe's tail section, which contains its index. + * @return the number of bytes in the tail + */ + long getFooterLength(); + + /** + * Get the number of rows in the stripe. + * @return a count of the number of rows + */ + long getNumberOfRows(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java new file mode 100644 index 0000000000..013fc8ec80 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StripeStatistics.java @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import java.util.List; + +public class StripeStatistics { + private final List cs; + + StripeStatistics(List list) { + this.cs = list; + } + + /** + * Return list of column statistics + * + * @return column stats + */ + public ColumnStatistics[] getColumnStatistics() { + ColumnStatistics[] result = new ColumnStatistics[cs.size()]; + for (int i = 0; i < result.length; ++i) { + result[i] = ColumnStatisticsImpl.deserialize(cs.get(i)); + } + return result; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java new file mode 100644 index 0000000000..6fad0ac1fe --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/TimestampColumnStatistics.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import java.sql.Timestamp; + +/** + * Statistics for Timestamp columns. + */ +public interface TimestampColumnStatistics extends ColumnStatistics { + /** + * Get the minimum value for the column. + * @return minimum value + */ + Timestamp getMinimum(); + + /** + * Get the maximum value for the column. + * @return maximum value + */ + Timestamp getMaximum(); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/VectorizedOrcSerde.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/VectorizedOrcSerde.java new file mode 100644 index 0000000000..aea1d89e4d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/VectorizedOrcSerde.java @@ -0,0 +1,88 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.hadoop.io.ObjectWritable; +import org.apache.hadoop.io.Writable; + +/** + * A serde class for ORC. + * It transparently passes the object to/from the ORC file reader/writer. 
+ */ +public class VectorizedOrcSerde extends OrcSerde { + private final OrcStruct[] orcStructArray = new OrcStruct[VectorizedRowBatch.DEFAULT_SIZE]; + private final Writable [] orcRowArray = new Writable [VectorizedRowBatch.DEFAULT_SIZE]; + private final ObjectWritable ow = new ObjectWritable(); + private final ObjectInspector inspector = null; + private final VectorExpressionWriter[] valueWriters; + + public VectorizedOrcSerde(ObjectInspector objInspector) { + super(); + for (int i = 0; i < orcStructArray.length; i++) { + orcRowArray[i] = new OrcSerde.OrcSerdeRow(); + } + try { + valueWriters = VectorExpressionWriterFactory + .getExpressionWriters((StructObjectInspector) objInspector); + } catch (HiveException e) { + throw new RuntimeException(e); + } + } + + + @Override + public Writable serialize(Object obj, ObjectInspector inspector) { + VectorizedRowBatch batch = (VectorizedRowBatch) obj; + try { + for (int i = 0; i < batch.size; i++) { + OrcStruct ost = orcStructArray[i]; + if (ost == null) { + ost = new OrcStruct(batch.numCols); + orcStructArray[i] = ost; + } + int index = 0; + if (batch.selectedInUse) { + index = batch.selected[i]; + } else { + index = i; + } + for (int p = 0; p < batch.projectionSize; p++) { + int k = batch.projectedColumns[p]; + if (batch.cols[k].isRepeating) { + valueWriters[p].setValue(ost, batch.cols[k], 0); + } else { + valueWriters[p].setValue(ost, batch.cols[k], index); + } + } + OrcSerde.OrcSerdeRow row = (OrcSerde.OrcSerdeRow) orcRowArray[i]; + row.realRow = ost; + row.inspector = inspector; + } + } catch (HiveException ex) { + throw new RuntimeException(ex); + } + ow.set(orcRowArray); + return ow; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java new file mode 100644 index 0000000000..d8dae5343d --- /dev/null +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.thirdparty.orc; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.List; + +/** + * The interface for writing ORC files. + */ +public interface Writer { + /** + * Add arbitrary meta-data to the ORC file. This may be called at any point + * until the Writer is closed. If the same key is passed a second time, the + * second value will replace the first. + * @param key a key to label the data with. + * @param value the contents of the metadata. + */ + void addUserMetadata(String key, ByteBuffer value); + + /** + * Add a row to the ORC file. + * @param row the row to add + * @throws IOException + */ + void addRow(Object row) throws IOException; + + /** + * Flush all of the buffers and close the file. No methods on this writer + * should be called afterwards. + * @throws IOException + */ + void close() throws IOException; + + /** + * Return the deserialized data size. Raw data size will be compute when + * writing the file footer. 
Hence raw data size value will be available only + * after closing the writer. + * + * @return raw data size + */ + long getRawDataSize(); + + /** + * Return the number of rows in file. Row count gets updated when flushing + * the stripes. To get accurate row count this method should be called after + * closing the writer. + * + * @return row count + */ + long getNumberOfRows(); + + /** + * Write an intermediate footer on the file such that if the file is + * truncated to the returned offset, it would be a valid ORC file. + * @return the offset that would be a valid end location for an ORC file + */ + long writeIntermediateFooter() throws IOException; + + /** + * Fast stripe append to ORC file. This interface is used for fast ORC file + * merge with other ORC files. When merging, the file to be merged should pass + * stripe in binary form along with stripe information and stripe statistics. + * After appending last stripe of a file, use appendUserMetadata() to append + * any user metadata. + * @param stripe - stripe as byte array + * @param offset - offset within byte array + * @param length - length of stripe within byte array + * @param stripeInfo - stripe information + * @param stripeStatistics - stripe statistics (Protobuf objects can be + * merged directly) + * @throws IOException + */ + public void appendStripe(byte[] stripe, int offset, int length, + StripeInformation stripeInfo, + OrcProto.StripeStatistics stripeStatistics) throws IOException; + + /** + * When fast stripe append is used for merging ORC stripes, after appending + * the last stripe from a file, this interface must be used to merge any + * user metadata. 
+ * @param userMetadata - user metadata + */ + public void appendUserMetadata(List userMetadata); +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java new file mode 100644 index 0000000000..4753f1b321 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -0,0 +1,2524 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.thirdparty.orc; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Joiner; +import com.google.common.collect.Lists; +import com.google.common.primitives.Longs; +import com.google.protobuf.ByteString; +import com.google.protobuf.CodedOutputStream; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.common.type.HiveDecimal; +import org.apache.hadoop.hive.ql.io.IOConstants; +import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.tajo.storage.thirdparty.orc.CompressionCodec.Modifier; +import org.apache.tajo.storage.thirdparty.orc.OrcProto.RowIndexEntry; +import org.apache.tajo.storage.thirdparty.orc.OrcProto.StripeStatistics; +import org.apache.tajo.storage.thirdparty.orc.OrcProto.Type; +import org.apache.tajo.storage.thirdparty.orc.OrcProto.UserMetadataItem; +import org.apache.hadoop.hive.ql.util.JavaDataModel; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.*; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.*; +import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; +import org.apache.hadoop.io.BytesWritable; +import org.apache.hadoop.io.Text; + +import java.io.IOException; +import java.io.OutputStream; +import java.lang.management.ManagementFactory; +import java.nio.ByteBuffer; +import java.sql.Timestamp; +import java.util.*; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + * An ORC file writer. The file is divided into stripes, which is the natural + * unit of work when reading. 
Each stripe is buffered in memory until the + * memory reaches the stripe size and then it is written out broken down by + * columns. Each column is written by a TreeWriter that is specific to that + * type of column. TreeWriters may have children TreeWriters that handle the + * sub-types. Each of the TreeWriters writes the column's data as a set of + * streams. + * + * This class is synchronized so that multi-threaded access is ok. In + * particular, because the MemoryManager is shared between writers, this class + * assumes that checkMemory may be called from a separate thread. + */ +public class WriterImpl implements Writer, MemoryManager.Callback { + + private static final Log LOG = LogFactory.getLog(WriterImpl.class); + + private static final int HDFS_BUFFER_SIZE = 256 * 1024; + private static final int MIN_ROW_INDEX_STRIDE = 1000; + + // threshold above which buffer size will be automatically resized + private static final int COLUMN_COUNT_THRESHOLD = 1000; + + private final FileSystem fs; + private final Path path; + private final long defaultStripeSize; + private long adjustedStripeSize; + private final int rowIndexStride; + private final CompressionKind compress; + private final CompressionCodec codec; + private final boolean addBlockPadding; + private final int bufferSize; + private final long blockSize; + private final float paddingTolerance; + // the streams that make up the current stripe + private final Map streams = + new TreeMap(); + + private FSDataOutputStream rawWriter = null; + // the compressed metadata information outStream + private OutStream writer = null; + // a protobuf outStream around streamFactory + private CodedOutputStream protobufWriter = null; + private long headerLength; + private int columnCount; + private long rowCount = 0; + private long rowsInStripe = 0; + private long rawDataSize = 0; + private int rowsInIndex = 0; + private int stripesAtLastFlush = -1; + private final List stripes = + new ArrayList(); + private final Map 
userMetadata = + new TreeMap(); + private final StreamFactory streamFactory = new StreamFactory(); + private final TreeWriter treeWriter; + private final boolean buildIndex; + private final MemoryManager memoryManager; + private final OrcFile.Version version; + private final Configuration conf; + private final OrcFile.WriterCallback callback; + private final OrcFile.WriterContext callbackContext; + private final OrcFile.EncodingStrategy encodingStrategy; + private final OrcFile.CompressionStrategy compressionStrategy; + private final boolean[] bloomFilterColumns; + private final double bloomFilterFpp; + + WriterImpl(FileSystem fs, + Path path, + Configuration conf, + ObjectInspector inspector, + long stripeSize, + CompressionKind compress, + int bufferSize, + int rowIndexStride, + MemoryManager memoryManager, + boolean addBlockPadding, + OrcFile.Version version, + OrcFile.WriterCallback callback, + OrcFile.EncodingStrategy encodingStrategy, + OrcFile.CompressionStrategy compressionStrategy, + float paddingTolerance, + long blockSizeValue, + String bloomFilterColumnNames, + double bloomFilterFpp) throws IOException { + this.fs = fs; + this.path = path; + this.conf = conf; + this.callback = callback; + if (callback != null) { + callbackContext = new OrcFile.WriterContext(){ + + @Override + public Writer getWriter() { + return WriterImpl.this; + } + }; + } else { + callbackContext = null; + } + this.adjustedStripeSize = stripeSize; + this.defaultStripeSize = stripeSize; + this.version = version; + this.encodingStrategy = encodingStrategy; + this.compressionStrategy = compressionStrategy; + this.addBlockPadding = addBlockPadding; + this.blockSize = blockSizeValue; + this.paddingTolerance = paddingTolerance; + this.compress = compress; + this.rowIndexStride = rowIndexStride; + this.memoryManager = memoryManager; + buildIndex = rowIndexStride > 0; + codec = createCodec(compress); + String allColumns = conf.get(IOConstants.COLUMNS); + if (allColumns == null) { + 
allColumns = getColumnNamesFromInspector(inspector); + } + this.bufferSize = getEstimatedBufferSize(allColumns, bufferSize); + if (version == OrcFile.Version.V_0_11) { + /* do not write bloom filters for ORC v11 */ + this.bloomFilterColumns = + OrcUtils.includeColumns(null, allColumns, inspector); + } else { + this.bloomFilterColumns = + OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector); + } + this.bloomFilterFpp = bloomFilterFpp; + treeWriter = createTreeWriter(inspector, streamFactory, false); + if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) { + throw new IllegalArgumentException("Row stride must be at least " + + MIN_ROW_INDEX_STRIDE); + } + + // ensure that we are able to handle callbacks before we register ourselves + memoryManager.addWriter(path, stripeSize, this); + } + + private String getColumnNamesFromInspector(ObjectInspector inspector) { + List fieldNames = Lists.newArrayList(); + Joiner joiner = Joiner.on(","); + if (inspector instanceof StructObjectInspector) { + StructObjectInspector soi = (StructObjectInspector) inspector; + List fields = soi.getAllStructFieldRefs(); + for(StructField sf : fields) { + fieldNames.add(sf.getFieldName()); + } + } + return joiner.join(fieldNames); + } + + @VisibleForTesting + int getEstimatedBufferSize(int bs) { + return getEstimatedBufferSize(conf.get(IOConstants.COLUMNS), bs); + } + + int getEstimatedBufferSize(String colNames, int bs) { + long availableMem = getMemoryAvailableForORC(); + if (colNames != null) { + final int numCols = colNames.split(",").length; + if (numCols > COLUMN_COUNT_THRESHOLD) { + // In BufferedStream, there are 3 outstream buffers (compressed, + // uncompressed and overflow) and list of previously compressed buffers. + // Since overflow buffer is rarely used, lets consider only 2 allocation. + // Also, initially, the list of compression buffers will be empty. + final int outStreamBuffers = codec == null ? 1 : 2; + + // max possible streams per column is 5. 
For string columns, there is + // ROW_INDEX, PRESENT, DATA, LENGTH, DICTIONARY_DATA streams. + final int maxStreams = 5; + + // Lets assume 10% memory for holding dictionary in memory and other + // object allocations + final long miscAllocation = (long) (0.1f * availableMem); + + // compute the available memory + final long remainingMem = availableMem - miscAllocation; + + int estBufferSize = (int) (remainingMem / + (maxStreams * outStreamBuffers * numCols)); + estBufferSize = getClosestBufferSize(estBufferSize, bs); + if (estBufferSize > bs) { + estBufferSize = bs; + } + + LOG.info("WIDE TABLE - Number of columns: " + numCols + + " Chosen compression buffer size: " + estBufferSize); + return estBufferSize; + } + } + return bs; + } + + private int getClosestBufferSize(int estBufferSize, int bs) { + final int kb4 = 4 * 1024; + final int kb8 = 8 * 1024; + final int kb16 = 16 * 1024; + final int kb32 = 32 * 1024; + final int kb64 = 64 * 1024; + final int kb128 = 128 * 1024; + final int kb256 = 256 * 1024; + if (estBufferSize <= kb4) { + return kb4; + } else if (estBufferSize > kb4 && estBufferSize <= kb8) { + return kb8; + } else if (estBufferSize > kb8 && estBufferSize <= kb16) { + return kb16; + } else if (estBufferSize > kb16 && estBufferSize <= kb32) { + return kb32; + } else if (estBufferSize > kb32 && estBufferSize <= kb64) { + return kb64; + } else if (estBufferSize > kb64 && estBufferSize <= kb128) { + return kb128; + } else { + return kb256; + } + } + + // the assumption is only one ORC writer open at a time, which holds true for + // most of the cases. HIVE-6455 forces single writer case. + private long getMemoryAvailableForORC() { + OrcConf.ConfVars poolVar = OrcConf.ConfVars.HIVE_ORC_FILE_MEMORY_POOL; + double maxLoad = conf.getFloat(poolVar.varname, poolVar.defaultFloatVal); + long totalMemoryPool = Math.round(ManagementFactory.getMemoryMXBean(). 
+ getHeapMemoryUsage().getMax() * maxLoad); + return totalMemoryPool; + } + + public static CompressionCodec createCodec(CompressionKind kind) { + switch (kind) { + case NONE: + return null; + case ZLIB: + return new ZlibCodec(); + case SNAPPY: + return new SnappyCodec(); + case LZO: + try { + Class lzo = + (Class) + Class.forName("org.apache.hadoop.hive.ql.io.orc.LzoCodec"); + return lzo.newInstance(); + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException("LZO is not available.", e); + } catch (InstantiationException e) { + throw new IllegalArgumentException("Problem initializing LZO", e); + } catch (IllegalAccessException e) { + throw new IllegalArgumentException("Insufficient access to LZO", e); + } + default: + throw new IllegalArgumentException("Unknown compression codec: " + + kind); + } + } + + @Override + public synchronized boolean checkMemory(double newScale) throws IOException { + long limit = (long) Math.round(adjustedStripeSize * newScale); + long size = estimateStripeSize(); + if (LOG.isDebugEnabled()) { + LOG.debug("ORC writer " + path + " size = " + size + " limit = " + + limit); + } + if (size > limit) { + flushStripe(); + return true; + } + return false; + } + + /** + * This class is used to hold the contents of streams as they are buffered. + * The TreeWriters write to the outStream and the codec compresses the + * data as buffers fill up and stores them in the output list. When the + * stripe is being written, the whole stream is written to the file. + */ + private class BufferedStream implements OutStream.OutputReceiver { + private final OutStream outStream; + private final List output = new ArrayList(); + + BufferedStream(String name, int bufferSize, + CompressionCodec codec) throws IOException { + outStream = new OutStream(name, bufferSize, codec, this); + } + + /** + * Receive a buffer from the compression codec. 
+ * @param buffer the buffer to save + * @throws IOException + */ + @Override + public void output(ByteBuffer buffer) { + output.add(buffer); + } + + /** + * Get the number of bytes in buffers that are allocated to this stream. + * @return number of bytes in buffers + */ + public long getBufferSize() { + long result = 0; + for(ByteBuffer buf: output) { + result += buf.capacity(); + } + return outStream.getBufferSize() + result; + } + + /** + * Flush the stream to the codec. + * @throws IOException + */ + public void flush() throws IOException { + outStream.flush(); + } + + /** + * Clear all of the buffers. + * @throws IOException + */ + public void clear() throws IOException { + outStream.clear(); + output.clear(); + } + + /** + * Check the state of suppress flag in output stream + * @return value of suppress flag + */ + public boolean isSuppressed() { + return outStream.isSuppressed(); + } + + /** + * Get the number of bytes that will be written to the output. Assumes + * the stream has already been flushed. + * @return the number of bytes + */ + public long getOutputSize() { + long result = 0; + for(ByteBuffer buffer: output) { + result += buffer.remaining(); + } + return result; + } + + /** + * Write the saved compressed buffers to the OutputStream. + * @param out the stream to write to + * @throws IOException + */ + void spillTo(OutputStream out) throws IOException { + for(ByteBuffer buffer: output) { + out.write(buffer.array(), buffer.arrayOffset() + buffer.position(), + buffer.remaining()); + } + } + + @Override + public String toString() { + return outStream.toString(); + } + } + + /** + * An output receiver that writes the ByteBuffers to the output stream + * as they are received. 
+ */ + private class DirectStream implements OutStream.OutputReceiver { + private final FSDataOutputStream output; + + DirectStream(FSDataOutputStream output) { + this.output = output; + } + + @Override + public void output(ByteBuffer buffer) throws IOException { + output.write(buffer.array(), buffer.arrayOffset() + buffer.position(), + buffer.remaining()); + } + } + + private static class RowIndexPositionRecorder implements PositionRecorder { + private final OrcProto.RowIndexEntry.Builder builder; + + RowIndexPositionRecorder(OrcProto.RowIndexEntry.Builder builder) { + this.builder = builder; + } + + @Override + public void addPosition(long position) { + builder.addPositions(position); + } + } + + /** + * Interface from the Writer to the TreeWriters. This limits the visibility + * that the TreeWriters have into the Writer. + */ + private class StreamFactory { + /** + * Create a stream to store part of a column. + * @param column the column id for the stream + * @param kind the kind of stream + * @return The output outStream that the section needs to be written to. 
+ * @throws IOException + */ + public OutStream createStream(int column, + OrcProto.Stream.Kind kind + ) throws IOException { + final StreamName name = new StreamName(column, kind); + final EnumSet modifiers; + + switch (kind) { + case BLOOM_FILTER: + case DATA: + case DICTIONARY_DATA: + if (getCompressionStrategy() == OrcFile.CompressionStrategy.SPEED) { + modifiers = EnumSet.of(Modifier.FAST, Modifier.TEXT); + } else { + modifiers = EnumSet.of(Modifier.DEFAULT, Modifier.TEXT); + } + break; + case LENGTH: + case DICTIONARY_COUNT: + case PRESENT: + case ROW_INDEX: + case SECONDARY: + // easily compressed using the fastest modes + modifiers = EnumSet.of(Modifier.FASTEST, Modifier.BINARY); + break; + default: + LOG.warn("Missing ORC compression modifiers for " + kind); + modifiers = null; + break; + } + + BufferedStream result = streams.get(name); + if (result == null) { + result = new BufferedStream(name.toString(), bufferSize, + codec == null ? codec : codec.modify(modifiers)); + streams.put(name, result); + } + return result.outStream; + } + + /** + * Get the next column id. + * @return a number from 0 to the number of columns - 1 + */ + public int getNextColumnId() { + return columnCount++; + } + + /** + * Get the current column id. After creating all tree writers this count should tell how many + * columns (including columns within nested complex objects) are created in total. + * @return current column id + */ + public int getCurrentColumnId() { + return columnCount; + } + + /** + * Get the stride rate of the row index. + */ + public int getRowIndexStride() { + return rowIndexStride; + } + + /** + * Should be building the row index. + * @return true if we are building the index + */ + public boolean buildIndex() { + return buildIndex; + } + + /** + * Is the ORC file compressed? + * @return are the streams compressed + */ + public boolean isCompressed() { + return codec != null; + } + + /** + * Get the encoding strategy to use. 
+ * @return encoding strategy + */ + public OrcFile.EncodingStrategy getEncodingStrategy() { + return encodingStrategy; + } + + /** + * Get the compression strategy to use. + * @return compression strategy + */ + public OrcFile.CompressionStrategy getCompressionStrategy() { + return compressionStrategy; + } + + /** + * Get the bloom filter columns + * @return bloom filter columns + */ + public boolean[] getBloomFilterColumns() { + return bloomFilterColumns; + } + + /** + * Get bloom filter false positive percentage. + * @return fpp + */ + public double getBloomFilterFPP() { + return bloomFilterFpp; + } + + /** + * Get the writer's configuration. + * @return configuration + */ + public Configuration getConfiguration() { + return conf; + } + + /** + * Get the version of the file to write. + */ + public OrcFile.Version getVersion() { + return version; + } + } + + /** + * The parent class of all of the writers for each column. Each column + * is written by an instance of this class. The compound types (struct, + * list, map, and union) have children tree writers that write the children + * types. 
 */
private abstract static class TreeWriter {
  protected final int id;
  protected final ObjectInspector inspector;
  // PRESENT bit stream; null when the column is declared non-nullable.
  private final BitFieldWriter isPresent;
  private final boolean isCompressed;
  // Statistics at three granularities: per row-index stride, per stripe,
  // and for the whole file. Stride stats merge up into stripe stats,
  // stripe stats merge up into file stats.
  protected final ColumnStatisticsImpl indexStatistics;
  protected final ColumnStatisticsImpl stripeColStatistics;
  private final ColumnStatisticsImpl fileStatistics;
  protected TreeWriter[] childrenWriters;
  protected final RowIndexPositionRecorder rowIndexPosition;
  private final OrcProto.RowIndex.Builder rowIndex;
  private final OrcProto.RowIndexEntry.Builder rowIndexEntry;
  private final PositionedOutputStream rowIndexStream;
  private final PositionedOutputStream bloomFilterStream;
  protected final BloomFilterIO bloomFilter;
  protected final boolean createBloomFilter;
  private final OrcProto.BloomFilterIndex.Builder bloomFilterIndex;
  private final OrcProto.BloomFilter.Builder bloomFilterEntry;
  // Tracks whether any null was seen in the current stripe so the PRESENT
  // stream can be suppressed when it carries no information.
  private boolean foundNulls;
  private OutStream isPresentOutStream;
  // NOTE(review): the generic type parameter appears stripped by patch
  // extraction (presumably List<OrcProto.StripeStatistics.Builder>) —
  // confirm against the upstream ORC source.
  private final List stripeStatsBuilders;

  /**
   * Create a tree writer.
   * @param columnId the column id of the column to write
   * @param inspector the object inspector to use
   * @param streamFactory limited access to the Writer's data.
   * @param nullable can the value be null?
   * @throws IOException
   */
  TreeWriter(int columnId, ObjectInspector inspector,
             StreamFactory streamFactory,
             boolean nullable) throws IOException {
    this.isCompressed = streamFactory.isCompressed();
    this.id = columnId;
    this.inspector = inspector;
    if (nullable) {
      isPresentOutStream = streamFactory.createStream(id,
          OrcProto.Stream.Kind.PRESENT);
      isPresent = new BitFieldWriter(isPresentOutStream, 1);
    } else {
      isPresent = null;
    }
    this.foundNulls = false;
    createBloomFilter = streamFactory.getBloomFilterColumns()[columnId];
    indexStatistics = ColumnStatisticsImpl.create(inspector);
    stripeColStatistics = ColumnStatisticsImpl.create(inspector);
    fileStatistics = ColumnStatisticsImpl.create(inspector);
    childrenWriters = new TreeWriter[0];
    rowIndex = OrcProto.RowIndex.newBuilder();
    rowIndexEntry = OrcProto.RowIndexEntry.newBuilder();
    rowIndexPosition = new RowIndexPositionRecorder(rowIndexEntry);
    stripeStatsBuilders = Lists.newArrayList();
    if (streamFactory.buildIndex()) {
      rowIndexStream = streamFactory.createStream(id, OrcProto.Stream.Kind.ROW_INDEX);
    } else {
      rowIndexStream = null;
    }
    if (createBloomFilter) {
      bloomFilterEntry = OrcProto.BloomFilter.newBuilder();
      bloomFilterIndex = OrcProto.BloomFilterIndex.newBuilder();
      bloomFilterStream = streamFactory.createStream(id, OrcProto.Stream.Kind.BLOOM_FILTER);
      bloomFilter = new BloomFilterIO(streamFactory.getRowIndexStride(),
          streamFactory.getBloomFilterFPP());
    } else {
      bloomFilterEntry = null;
      bloomFilterIndex = null;
      bloomFilterStream = null;
      bloomFilter = null;
    }
  }

  protected OrcProto.RowIndex.Builder getRowIndex() {
    return rowIndex;
  }

  protected ColumnStatisticsImpl getStripeStatistics() {
    return stripeColStatistics;
  }

  protected ColumnStatisticsImpl getFileStatistics() {
    return fileStatistics;
  }

  protected OrcProto.RowIndexEntry.Builder getRowIndexEntry() {
    return rowIndexEntry;
  }

  IntegerWriter
createIntegerWriter(PositionedOutputStream output,
                    boolean signed, boolean isDirectV2,
                    StreamFactory writer) {
  if (isDirectV2) {
    // Aligned bitpacking trades a little space for decode speed; only used
    // when the writer's encoding strategy prefers SPEED.
    boolean alignedBitpacking = false;
    if (writer.getEncodingStrategy().equals(OrcFile.EncodingStrategy.SPEED)) {
      alignedBitpacking = true;
    }
    return new RunLengthIntegerWriterV2(output, signed, alignedBitpacking);
  } else {
    return new RunLengthIntegerWriter(output, signed);
  }
}

boolean isNewWriteFormat(StreamFactory writer) {
  return writer.getVersion() != OrcFile.Version.V_0_11;
}

/**
 * Add a new value to the column.
 * @param obj the value to record; may be null, which updates the null
 *            statistics and the PRESENT stream
 * @throws IOException
 */
void write(Object obj) throws IOException {
  if (obj != null) {
    indexStatistics.increment();
  } else {
    indexStatistics.setNull();
  }
  if (isPresent != null) {
    isPresent.write(obj == null ? 0 : 1);
    if(obj == null) {
      foundNulls = true;
    }
  }
}

// Strip the PRESENT-stream positions from every row index entry; called
// when the PRESENT stream is suppressed (no nulls in the stripe) so the
// index no longer references a stream that will not be written.
private void removeIsPresentPositions() {
  for(int i=0; i < rowIndex.getEntryCount(); ++i) {
    RowIndexEntry.Builder entry = rowIndex.getEntryBuilder(i);
    // NOTE(review): generic parameter appears stripped by patch
    // extraction (presumably List<Long>) — confirm against upstream.
    List positions = entry.getPositionsList();
    // bit streams use 3 positions if uncompressed, 4 if compressed
    positions = positions.subList(isCompressed ? 4 : 3, positions.size());
    entry.clearPositions();
    entry.addAllPositions(positions);
  }
}

/**
 * Write the stripe out to the file.
 * @param builder the stripe footer that contains the information about the
 *                layout of the stripe. The TreeWriter is required to update
 *                the footer with its information.
 * @param requiredIndexEntries the number of index entries that are
 *                             required. this is to check to make sure the
 *                             row index is well formed.
 * @throws IOException
 */
void writeStripe(OrcProto.StripeFooter.Builder builder,
                 int requiredIndexEntries) throws IOException {
  if (isPresent != null) {
    isPresent.flush();

    // if no nulls are found in a stream, then suppress the stream
    if(!foundNulls) {
      isPresentOutStream.suppress();
      // since isPresent bitstream is suppressed, update the index to
      // remove the positions of the isPresent stream
      if (rowIndexStream != null) {
        removeIsPresentPositions();
      }
    }
  }

  // merge stripe-level column statistics to file statistics and write it to
  // stripe statistics
  OrcProto.StripeStatistics.Builder stripeStatsBuilder = OrcProto.StripeStatistics.newBuilder();
  writeStripeStatistics(stripeStatsBuilder, this);
  stripeStatsBuilders.add(stripeStatsBuilder);

  // reset the flag for next stripe
  foundNulls = false;

  builder.addColumns(getEncoding());
  builder.setWriterTimezone(TimeZone.getDefault().getID());
  if (rowIndexStream != null) {
    if (rowIndex.getEntryCount() != requiredIndexEntries) {
      throw new IllegalArgumentException("Column has wrong number of " +
           "index entries found: " + rowIndex.getEntryCount() + " expected: " +
           requiredIndexEntries);
    }
    rowIndex.build().writeTo(rowIndexStream);
    rowIndexStream.flush();
  }
  rowIndex.clear();
  rowIndexEntry.clear();

  // write the bloom filter to out stream
  if (bloomFilterStream != null) {
    bloomFilterIndex.build().writeTo(bloomFilterStream);
    bloomFilterStream.flush();
    bloomFilterIndex.clear();
    bloomFilterEntry.clear();
  }
}

// Recursively fold each writer's stripe statistics into its file
// statistics and serialize them into the stripe-statistics builder,
// resetting the stripe stats for the next stripe.
private void writeStripeStatistics(OrcProto.StripeStatistics.Builder builder,
                                   TreeWriter treeWriter) {
  treeWriter.fileStatistics.merge(treeWriter.stripeColStatistics);
  builder.addColStats(treeWriter.stripeColStatistics.serialize().build());
  treeWriter.stripeColStatistics.reset();
  for (TreeWriter child : treeWriter.getChildrenWriters()) {
    writeStripeStatistics(builder, child);
  }
}

TreeWriter[] getChildrenWriters() {
  return childrenWriters;
}

/**
 * Get the encoding for this column.
 * @return the information about the encoding of this column
 */
OrcProto.ColumnEncoding getEncoding() {
  return OrcProto.ColumnEncoding.newBuilder().setKind(
      OrcProto.ColumnEncoding.Kind.DIRECT).build();
}

/**
 * Create a row index entry with the previous location and the current
 * index statistics. Also merges the index statistics into the file
 * statistics before they are cleared. Finally, it records the start of the
 * next index and ensures all of the children columns also create an entry.
 * @throws IOException
 */
void createRowIndexEntry() throws IOException {
  stripeColStatistics.merge(indexStatistics);
  rowIndexEntry.setStatistics(indexStatistics.serialize());
  indexStatistics.reset();
  rowIndex.addEntry(rowIndexEntry);
  rowIndexEntry.clear();
  addBloomFilterEntry();
  recordPosition(rowIndexPosition);
  for(TreeWriter child: childrenWriters) {
    child.createRowIndexEntry();
  }
}

// Serialize the bloom filter accumulated for the current index stride
// into the bloom filter index and reset it for the next stride.
void addBloomFilterEntry() {
  if (createBloomFilter) {
    bloomFilterEntry.setNumHashFunctions(bloomFilter.getNumHashFunctions());
    bloomFilterEntry.addAllBitset(Longs.asList(bloomFilter.getBitSet()));
    bloomFilterIndex.addBloomFilter(bloomFilterEntry.build());
    bloomFilter.reset();
    bloomFilterEntry.clear();
  }
}

/**
 * Record the current position in each of this column's streams.
 * @param recorder where should the locations be recorded
 * @throws IOException
 */
void recordPosition(PositionRecorder recorder) throws IOException {
  if (isPresent != null) {
    isPresent.getPosition(recorder);
  }
}

/**
 * Estimate how much memory the writer is consuming excluding the streams.
 * @return the number of bytes.
 */
long estimateMemory() {
  long result = 0;
  for (TreeWriter child: childrenWriters) {
    result += child.estimateMemory();
  }
  return result;
}
}

// Writes boolean columns as a 1-bit-per-value bit field in the DATA stream.
private static class BooleanTreeWriter extends TreeWriter {
  private final BitFieldWriter writer;

  BooleanTreeWriter(int columnId,
                    ObjectInspector inspector,
                    StreamFactory writer,
                    boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    PositionedOutputStream out = writer.createStream(id,
        OrcProto.Stream.Kind.DATA);
    this.writer = new BitFieldWriter(out, 1);
    recordPosition(rowIndexPosition);
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      boolean val = ((BooleanObjectInspector) inspector).get(obj);
      indexStatistics.updateBoolean(val);
      writer.write(val ? 1 : 0);
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
    writer.flush();
    recordPosition(rowIndexPosition);
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    writer.getPosition(recorder);
  }
}

// Writes byte (tinyint) columns with run-length byte encoding.
private static class ByteTreeWriter extends TreeWriter {
  private final RunLengthByteWriter writer;

  ByteTreeWriter(int columnId,
                 ObjectInspector inspector,
                 StreamFactory writer,
                 boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    this.writer = new RunLengthByteWriter(writer.createStream(id,
        OrcProto.Stream.Kind.DATA));
    recordPosition(rowIndexPosition);
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      byte val = ((ByteObjectInspector) inspector).get(obj);
      indexStatistics.updateInteger(val);
      if (createBloomFilter) {
        bloomFilter.addLong(val);
      }
      writer.write(val);
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
    writer.flush();
    recordPosition(rowIndexPosition);
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    writer.getPosition(recorder);
  }
}

// Writes short/int/long columns with run-length integer encoding
// (v2 unless the file targets the legacy 0.11 format).
private static class IntegerTreeWriter extends TreeWriter {
  private final IntegerWriter writer;
  // Exactly one of the three inspectors below is non-null, chosen by the
  // column's declared integer width.
  private final ShortObjectInspector shortInspector;
  private final IntObjectInspector intInspector;
  private final LongObjectInspector longInspector;
  private boolean isDirectV2 = true;

  IntegerTreeWriter(int columnId,
                    ObjectInspector inspector,
                    StreamFactory writer,
                    boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    OutStream out = writer.createStream(id,
        OrcProto.Stream.Kind.DATA);
    this.isDirectV2 = isNewWriteFormat(writer);
    this.writer = createIntegerWriter(out, true, isDirectV2, writer);
    if (inspector instanceof IntObjectInspector) {
      intInspector = (IntObjectInspector) inspector;
      shortInspector = null;
      longInspector = null;
    } else {
      intInspector = null;
      if (inspector instanceof LongObjectInspector) {
        longInspector = (LongObjectInspector) inspector;
        shortInspector = null;
      } else {
        shortInspector = (ShortObjectInspector) inspector;
        longInspector = null;
      }
    }
    recordPosition(rowIndexPosition);
  }

  @Override
  OrcProto.ColumnEncoding getEncoding() {
    if (isDirectV2) {
      return OrcProto.ColumnEncoding.newBuilder()
          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
    }
    return OrcProto.ColumnEncoding.newBuilder()
        .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      long val;
      if (intInspector != null) {
        val = intInspector.get(obj);
      } else if (longInspector != null) {
        val = longInspector.get(obj);
      } else {
        val =
shortInspector.get(obj);
      }
      indexStatistics.updateInteger(val);
      if (createBloomFilter) {
        // integers are converted to longs in column statistics and during SARG evaluation
        bloomFilter.addLong(val);
      }
      writer.write(val);
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
    writer.flush();
    recordPosition(rowIndexPosition);
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    writer.getPosition(recorder);
  }
}

// Writes float columns as raw IEEE bits via SerializationUtils.
private static class FloatTreeWriter extends TreeWriter {
  private final PositionedOutputStream stream;
  private final SerializationUtils utils;

  FloatTreeWriter(int columnId,
                  ObjectInspector inspector,
                  StreamFactory writer,
                  boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    this.stream = writer.createStream(id,
        OrcProto.Stream.Kind.DATA);
    this.utils = new SerializationUtils();
    recordPosition(rowIndexPosition);
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      float val = ((FloatObjectInspector) inspector).get(obj);
      indexStatistics.updateDouble(val);
      if (createBloomFilter) {
        // floats are converted to doubles in column statistics and during SARG evaluation
        bloomFilter.addDouble(val);
      }
      utils.writeFloat(stream, val);
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
    stream.flush();
    recordPosition(rowIndexPosition);
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    stream.getPosition(recorder);
  }
}

// Writes double columns as raw IEEE bits via SerializationUtils.
private static class DoubleTreeWriter extends TreeWriter {
  private final PositionedOutputStream stream;
  private final SerializationUtils utils;

  DoubleTreeWriter(int columnId,
                   ObjectInspector inspector,
                   StreamFactory writer,
                   boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    this.stream = writer.createStream(id,
        OrcProto.Stream.Kind.DATA);
    this.utils = new SerializationUtils();
    recordPosition(rowIndexPosition);
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      double val = ((DoubleObjectInspector) inspector).get(obj);
      indexStatistics.updateDouble(val);
      if (createBloomFilter) {
        bloomFilter.addDouble(val);
      }
      utils.writeDouble(stream, val);
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
    stream.flush();
    recordPosition(rowIndexPosition);
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    stream.getPosition(recorder);
  }
}

// Writes string columns, choosing between dictionary encoding and direct
// encoding based on the ratio of distinct keys to rows.
private static class StringTreeWriter extends TreeWriter {
  private static final int INITIAL_DICTIONARY_SIZE = 4096;
  private final OutStream stringOutput;
  private final IntegerWriter lengthOutput;
  private final IntegerWriter rowOutput;
  private final StringRedBlackTree dictionary =
      new StringRedBlackTree(INITIAL_DICTIONARY_SIZE);
  private final DynamicIntArray rows = new DynamicIntArray();
  private final PositionedOutputStream directStreamOutput;
  private final IntegerWriter directLengthOutput;
  // NOTE(review): generic parameters appear stripped by patch extraction
  // (presumably List<OrcProto.RowIndexEntry> and List<Long>) — confirm
  // against the upstream ORC source.
  private final List savedRowIndex =
      new ArrayList();
  private final boolean buildIndex;
  private final List rowIndexValueCount = new ArrayList();
  // If the number of keys in a dictionary is greater than this fraction of
  //the total number of non-null rows, turn off dictionary encoding
  private final float dictionaryKeySizeThreshold;
  private boolean useDictionaryEncoding = true;
  private boolean isDirectV2 = true;
private boolean doneDictionaryCheck;
  // When true, the dictionary-vs-direct decision is re-evaluated at each
  // row index stride rather than only at stripe flush.
  private final boolean strideDictionaryCheck;

  StringTreeWriter(int columnId,
                   ObjectInspector inspector,
                   StreamFactory writer,
                   boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    this.isDirectV2 = isNewWriteFormat(writer);
    stringOutput = writer.createStream(id,
        OrcProto.Stream.Kind.DICTIONARY_DATA);
    lengthOutput = createIntegerWriter(writer.createStream(id,
        OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
    rowOutput = createIntegerWriter(writer.createStream(id,
        OrcProto.Stream.Kind.DATA), false, isDirectV2, writer);
    recordPosition(rowIndexPosition);
    rowIndexValueCount.add(0L);
    buildIndex = writer.buildIndex();
    directStreamOutput = writer.createStream(id, OrcProto.Stream.Kind.DATA);
    directLengthOutput = createIntegerWriter(writer.createStream(id,
        OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
    dictionaryKeySizeThreshold = writer.getConfiguration().getFloat(
        OrcConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.varname,
        OrcConf.ConfVars.HIVE_ORC_DICTIONARY_KEY_SIZE_THRESHOLD.defaultFloatVal);
    strideDictionaryCheck = writer.getConfiguration().getBoolean(
        OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.varname,
        OrcConf.ConfVars.HIVE_ORC_ROW_INDEX_STRIDE_DICTIONARY_CHECK.defaultBoolVal);
    doneDictionaryCheck = false;
  }

  /**
   * Method to retrieve text values from the value object, which can be overridden
   * by subclasses.
   * @param obj value
   * @return Text text value from obj
   */
  Text getTextValue(Object obj) {
    return ((StringObjectInspector) inspector).getPrimitiveWritableObject(obj);
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      Text val = getTextValue(obj);
      if (useDictionaryEncoding || !strideDictionaryCheck) {
        rows.add(dictionary.add(val));
      } else {
        // write data and length
        directStreamOutput.write(val.getBytes(), 0, val.getLength());
        directLengthOutput.write(val.getLength());
      }
      indexStatistics.updateString(val);
      if (createBloomFilter) {
        bloomFilter.addBytes(val.getBytes(), val.getLength());
      }
    }
  }

  private boolean checkDictionaryEncoding() {
    if (!doneDictionaryCheck) {
      // Set the flag indicating whether or not to use dictionary encoding
      // based on whether or not the fraction of distinct keys over number of
      // non-null rows is less than the configured threshold
      float ratio = rows.size() > 0 ? (float) (dictionary.size()) / rows.size() : 0.0f;
      useDictionaryEncoding = !isDirectV2 || ratio <= dictionaryKeySizeThreshold;
      doneDictionaryCheck = true;
    }
    return useDictionaryEncoding;
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    // if rows in stripe is less than dictionaryCheckAfterRows, dictionary
    // checking would not have happened. So do it again here.
    checkDictionaryEncoding();

    if (useDictionaryEncoding) {
      flushDictionary();
    } else {
      // flushout any left over entries from dictionary
      if (rows.size() > 0) {
        flushDictionary();
      }

      // suppress the stream for every stripe if dictionary is disabled
      stringOutput.suppress();
    }

    // we need to build the rowindex before calling super, since it
    // writes it out.
    super.writeStripe(builder, requiredIndexEntries);
    stringOutput.flush();
    lengthOutput.flush();
    rowOutput.flush();
    directStreamOutput.flush();
    directLengthOutput.flush();
    // reset all of the fields to be ready for the next stripe.
    dictionary.clear();
    savedRowIndex.clear();
    rowIndexValueCount.clear();
    recordPosition(rowIndexPosition);
    rowIndexValueCount.add(0L);

    if (!useDictionaryEncoding) {
      // record the start positions of first index stride of next stripe i.e
      // beginning of the direct streams when dictionary is disabled
      recordDirectStreamPosition();
    }
  }

  // Emit the buffered dictionary (when enabled) and the buffered row
  // values, rewriting saved row index entries with final stream positions.
  private void flushDictionary() throws IOException {
    final int[] dumpOrder = new int[dictionary.size()];

    if (useDictionaryEncoding) {
      // Write the dictionary by traversing the red-black tree writing out
      // the bytes and lengths; and creating the map from the original order
      // to the final sorted order.

      dictionary.visit(new StringRedBlackTree.Visitor() {
        private int currentId = 0;
        @Override
        public void visit(StringRedBlackTree.VisitorContext context
                          ) throws IOException {
          context.writeBytes(stringOutput);
          lengthOutput.write(context.getLength());
          dumpOrder[context.getOriginalPosition()] = currentId++;
        }
      });
    } else {
      // for direct encoding, we don't want the dictionary data stream
      stringOutput.suppress();
    }
    int length = rows.size();
    int rowIndexEntry = 0;
    OrcProto.RowIndex.Builder rowIndex = getRowIndex();
    Text text = new Text();
    // write the values translated into the dump order.
    for(int i = 0; i <= length; ++i) {
      // now that we are writing out the row values, we can finalize the
      // row index
      if (buildIndex) {
        while (i == rowIndexValueCount.get(rowIndexEntry) &&
            rowIndexEntry < savedRowIndex.size()) {
          OrcProto.RowIndexEntry.Builder base =
              savedRowIndex.get(rowIndexEntry++).toBuilder();
          if (useDictionaryEncoding) {
            rowOutput.getPosition(new RowIndexPositionRecorder(base));
          } else {
            PositionRecorder posn = new RowIndexPositionRecorder(base);
            directStreamOutput.getPosition(posn);
            directLengthOutput.getPosition(posn);
          }
          rowIndex.addEntry(base.build());
        }
      }
      if (i != length) {
        if (useDictionaryEncoding) {
          rowOutput.write(dumpOrder[rows.get(i)]);
        } else {
          dictionary.getText(text, rows.get(i));
          directStreamOutput.write(text.getBytes(), 0, text.getLength());
          directLengthOutput.write(text.getLength());
        }
      }
    }
    rows.clear();
  }

  @Override
  OrcProto.ColumnEncoding getEncoding() {
    // Returns the encoding used for the last call to writeStripe
    if (useDictionaryEncoding) {
      if(isDirectV2) {
        return OrcProto.ColumnEncoding.newBuilder().setKind(
            OrcProto.ColumnEncoding.Kind.DICTIONARY_V2).
            setDictionarySize(dictionary.size()).build();
      }
      return OrcProto.ColumnEncoding.newBuilder().setKind(
          OrcProto.ColumnEncoding.Kind.DICTIONARY).
          setDictionarySize(dictionary.size()).build();
    } else {
      if(isDirectV2) {
        return OrcProto.ColumnEncoding.newBuilder().setKind(
            OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
      }
      return OrcProto.ColumnEncoding.newBuilder().setKind(
          OrcProto.ColumnEncoding.Kind.DIRECT).build();
    }
  }

  /**
   * This method doesn't call the super method, because unlike most of the
   * other TreeWriters, this one can't record the position in the streams
   * until the stripe is being flushed. Therefore it saves all of the entries
   * and augments them with the final information as the stripe is written.
 * @throws IOException
   */
  @Override
  void createRowIndexEntry() throws IOException {
    getStripeStatistics().merge(indexStatistics);
    OrcProto.RowIndexEntry.Builder rowIndexEntry = getRowIndexEntry();
    rowIndexEntry.setStatistics(indexStatistics.serialize());
    indexStatistics.reset();
    OrcProto.RowIndexEntry base = rowIndexEntry.build();
    savedRowIndex.add(base);
    rowIndexEntry.clear();
    addBloomFilterEntry();
    recordPosition(rowIndexPosition);
    rowIndexValueCount.add(Long.valueOf(rows.size()));
    if (strideDictionaryCheck) {
      checkDictionaryEncoding();
    }
    if (!useDictionaryEncoding) {
      if (rows.size() > 0) {
        flushDictionary();
        // just record the start positions of next index stride
        recordDirectStreamPosition();
      } else {
        // record the start positions of next index stride
        recordDirectStreamPosition();
        getRowIndex().addEntry(base);
      }
    }
  }

  private void recordDirectStreamPosition() throws IOException {
    directStreamOutput.getPosition(rowIndexPosition);
    directLengthOutput.getPosition(rowIndexPosition);
  }

  @Override
  long estimateMemory() {
    return rows.getSizeInBytes() + dictionary.getSizeInBytes();
  }
}

/**
 * Under the covers, char is written to ORC the same way as string.
 */
private static class CharTreeWriter extends StringTreeWriter {

  CharTreeWriter(int columnId,
                 ObjectInspector inspector,
                 StreamFactory writer,
                 boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
  }

  /**
   * Override base class implementation to support char values.
   */
  @Override
  Text getTextValue(Object obj) {
    return (((HiveCharObjectInspector) inspector)
        .getPrimitiveWritableObject(obj)).getTextValue();
  }
}

/**
 * Under the covers, varchar is written to ORC the same way as string.
 */
private static class VarcharTreeWriter extends StringTreeWriter {

  VarcharTreeWriter(int columnId,
                    ObjectInspector inspector,
                    StreamFactory writer,
                    boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
  }

  /**
   * Override base class implementation to support varchar values.
   */
  @Override
  Text getTextValue(Object obj) {
    return (((HiveVarcharObjectInspector) inspector)
        .getPrimitiveWritableObject(obj)).getTextValue();
  }
}

// Writes binary columns as raw bytes in DATA plus a LENGTH stream.
private static class BinaryTreeWriter extends TreeWriter {
  private final PositionedOutputStream stream;
  private final IntegerWriter length;
  private boolean isDirectV2 = true;

  BinaryTreeWriter(int columnId,
                   ObjectInspector inspector,
                   StreamFactory writer,
                   boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    this.stream = writer.createStream(id,
        OrcProto.Stream.Kind.DATA);
    this.isDirectV2 = isNewWriteFormat(writer);
    this.length = createIntegerWriter(writer.createStream(id,
        OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer);
    recordPosition(rowIndexPosition);
  }

  @Override
  OrcProto.ColumnEncoding getEncoding() {
    if (isDirectV2) {
      return OrcProto.ColumnEncoding.newBuilder()
          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
    }
    return OrcProto.ColumnEncoding.newBuilder()
        .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      BytesWritable val =
          ((BinaryObjectInspector) inspector).getPrimitiveWritableObject(obj);
      stream.write(val.getBytes(), 0, val.getLength());
      length.write(val.getLength());
      indexStatistics.updateBinary(val);
      if (createBloomFilter) {
        bloomFilter.addBytes(val.getBytes(), val.getLength());
      }
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
requiredIndexEntries);
    stream.flush();
    length.flush();
    recordPosition(rowIndexPosition);
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    stream.getPosition(recorder);
    length.getPosition(recorder);
  }
}

static final int MILLIS_PER_SECOND = 1000;
// Epoch against which timestamp seconds are stored as deltas.
static final String BASE_TIMESTAMP_STRING = "2015-01-01 00:00:00";

// Writes timestamp columns as seconds-since-base (DATA) plus encoded
// nanosecond remainders (SECONDARY).
private static class TimestampTreeWriter extends TreeWriter {
  private final IntegerWriter seconds;
  private final IntegerWriter nanos;
  private final boolean isDirectV2;
  private final long base_timestamp;

  TimestampTreeWriter(int columnId,
                      ObjectInspector inspector,
                      StreamFactory writer,
                      boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    this.isDirectV2 = isNewWriteFormat(writer);
    this.seconds = createIntegerWriter(writer.createStream(id,
        OrcProto.Stream.Kind.DATA), true, isDirectV2, writer);
    this.nanos = createIntegerWriter(writer.createStream(id,
        OrcProto.Stream.Kind.SECONDARY), false, isDirectV2, writer);
    recordPosition(rowIndexPosition);
    // for unit tests to set different time zones
    // NOTE(review): Timestamp.valueOf interprets the base string in the
    // JVM default time zone, so base_timestamp is zone-dependent — this
    // mirrors the stored writer timezone in the stripe footer; confirm
    // against the reader's handling.
    this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND;
  }

  @Override
  OrcProto.ColumnEncoding getEncoding() {
    if (isDirectV2) {
      return OrcProto.ColumnEncoding.newBuilder()
          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
    }
    return OrcProto.ColumnEncoding.newBuilder()
        .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      Timestamp val =
          ((TimestampObjectInspector) inspector).
              getPrimitiveJavaObject(obj);
      indexStatistics.updateTimestamp(val);
      seconds.write((val.getTime() / MILLIS_PER_SECOND) - base_timestamp);
      nanos.write(formatNanos(val.getNanos()));
      if (createBloomFilter) {
        bloomFilter.addLong(val.getTime());
      }
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
    seconds.flush();
    nanos.flush();
    recordPosition(rowIndexPosition);
  }

  // Encode nanoseconds compactly: the low 3 bits hold the count of decimal
  // trailing zeros removed (0 means none), the rest holds the remaining digits.
  private static long formatNanos(int nanos) {
    if (nanos == 0) {
      return 0;
    } else if (nanos % 100 != 0) {
      return ((long) nanos) << 3;
    } else {
      nanos /= 100;
      int trailingZeros = 1;
      while (nanos % 10 == 0 && trailingZeros < 7) {
        nanos /= 10;
        trailingZeros += 1;
      }
      return ((long) nanos) << 3 | trailingZeros;
    }
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    seconds.getPosition(recorder);
    nanos.getPosition(recorder);
  }
}

// Writes date columns as days-since-epoch integers.
private static class DateTreeWriter extends TreeWriter {
  private final IntegerWriter writer;
  private final boolean isDirectV2;

  DateTreeWriter(int columnId,
                 ObjectInspector inspector,
                 StreamFactory writer,
                 boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    OutStream out = writer.createStream(id,
        OrcProto.Stream.Kind.DATA);
    this.isDirectV2 = isNewWriteFormat(writer);
    this.writer = createIntegerWriter(out, true, isDirectV2, writer);
    recordPosition(rowIndexPosition);
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      // Using the Writable here as it's used directly for writing as well as for stats.
DateWritable val = ((DateObjectInspector) inspector).getPrimitiveWritableObject(obj);
      indexStatistics.updateDate(val);
      writer.write(val.getDays());
      if (createBloomFilter) {
        bloomFilter.addLong(val.getDays());
      }
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
    writer.flush();
    recordPosition(rowIndexPosition);
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    writer.getPosition(recorder);
  }

  @Override
  OrcProto.ColumnEncoding getEncoding() {
    if (isDirectV2) {
      return OrcProto.ColumnEncoding.newBuilder()
          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
    }
    return OrcProto.ColumnEncoding.newBuilder()
        .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
  }
}

// Writes decimal columns as an unscaled big integer (DATA) plus the
// scale (SECONDARY).
private static class DecimalTreeWriter extends TreeWriter {
  private final PositionedOutputStream valueStream;
  private final IntegerWriter scaleStream;
  private final boolean isDirectV2;

  DecimalTreeWriter(int columnId,
                    ObjectInspector inspector,
                    StreamFactory writer,
                    boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    this.isDirectV2 = isNewWriteFormat(writer);
    valueStream = writer.createStream(id, OrcProto.Stream.Kind.DATA);
    this.scaleStream = createIntegerWriter(writer.createStream(id,
        OrcProto.Stream.Kind.SECONDARY), true, isDirectV2, writer);
    recordPosition(rowIndexPosition);
  }

  @Override
  OrcProto.ColumnEncoding getEncoding() {
    if (isDirectV2) {
      return OrcProto.ColumnEncoding.newBuilder()
          .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build();
    }
    return OrcProto.ColumnEncoding.newBuilder()
        .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build();
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      HiveDecimal decimal = ((HiveDecimalObjectInspector) inspector).
          getPrimitiveJavaObject(obj);
      // NOTE(review): a null decimal is silently skipped here even though
      // super.write already counted obj as non-null — PRESENT says present
      // but no value is written; confirm this matches upstream intent.
      if (decimal == null) {
        return;
      }
      SerializationUtils.writeBigInteger(valueStream,
          decimal.unscaledValue());
      scaleStream.write(decimal.scale());
      indexStatistics.updateDecimal(decimal);
      if (createBloomFilter) {
        bloomFilter.addString(decimal.toString());
      }
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
    valueStream.flush();
    scaleStream.flush();
    recordPosition(rowIndexPosition);
  }

  @Override
  void recordPosition(PositionRecorder recorder) throws IOException {
    super.recordPosition(recorder);
    valueStream.getPosition(recorder);
    scaleStream.getPosition(recorder);
  }
}

// Writes struct columns by delegating each field to a child tree writer.
private static class StructTreeWriter extends TreeWriter {
  // NOTE(review): generic parameter appears stripped by patch extraction
  // (presumably List<StructField>) — confirm against upstream.
  private final List fields;
  StructTreeWriter(int columnId,
                   ObjectInspector inspector,
                   StreamFactory writer,
                   boolean nullable) throws IOException {
    super(columnId, inspector, writer, nullable);
    StructObjectInspector structObjectInspector =
        (StructObjectInspector) inspector;
    fields = structObjectInspector.getAllStructFieldRefs();
    childrenWriters = new TreeWriter[fields.size()];
    for(int i=0; i < childrenWriters.length; ++i) {
      childrenWriters[i] = createTreeWriter(
          fields.get(i).getFieldObjectInspector(), writer, true);
    }
    recordPosition(rowIndexPosition);
  }

  @Override
  void write(Object obj) throws IOException {
    super.write(obj);
    if (obj != null) {
      StructObjectInspector insp = (StructObjectInspector) inspector;
      for(int i = 0; i < fields.size(); ++i) {
        StructField field = fields.get(i);
        TreeWriter writer = childrenWriters[i];
        writer.write(insp.getStructFieldData(obj, field));
      }
    }
  }

  @Override
  void writeStripe(OrcProto.StripeFooter.Builder builder,
                   int requiredIndexEntries) throws IOException {
    super.writeStripe(builder, requiredIndexEntries);
for(TreeWriter child: childrenWriters) { + child.writeStripe(builder, requiredIndexEntries); + } + recordPosition(rowIndexPosition); + } + } + + private static class ListTreeWriter extends TreeWriter { + private final IntegerWriter lengths; + private final boolean isDirectV2; + + ListTreeWriter(int columnId, + ObjectInspector inspector, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, inspector, writer, nullable); + this.isDirectV2 = isNewWriteFormat(writer); + ListObjectInspector listObjectInspector = (ListObjectInspector) inspector; + childrenWriters = new TreeWriter[1]; + childrenWriters[0] = + createTreeWriter(listObjectInspector.getListElementObjectInspector(), + writer, true); + lengths = createIntegerWriter(writer.createStream(columnId, + OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); + recordPosition(rowIndexPosition); + } + + @Override + OrcProto.ColumnEncoding getEncoding() { + if (isDirectV2) { + return OrcProto.ColumnEncoding.newBuilder() + .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + } + return OrcProto.ColumnEncoding.newBuilder() + .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + } + + @Override + void write(Object obj) throws IOException { + super.write(obj); + if (obj != null) { + ListObjectInspector insp = (ListObjectInspector) inspector; + int len = insp.getListLength(obj); + lengths.write(len); + if (createBloomFilter) { + bloomFilter.addLong(len); + } + for(int i=0; i < len; ++i) { + childrenWriters[0].write(insp.getListElement(obj, i)); + } + } + } + + @Override + void writeStripe(OrcProto.StripeFooter.Builder builder, + int requiredIndexEntries) throws IOException { + super.writeStripe(builder, requiredIndexEntries); + lengths.flush(); + for(TreeWriter child: childrenWriters) { + child.writeStripe(builder, requiredIndexEntries); + } + recordPosition(rowIndexPosition); + } + + @Override + void recordPosition(PositionRecorder recorder) throws IOException { + 
super.recordPosition(recorder); + lengths.getPosition(recorder); + } + } + + private static class MapTreeWriter extends TreeWriter { + private final IntegerWriter lengths; + private final boolean isDirectV2; + + MapTreeWriter(int columnId, + ObjectInspector inspector, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, inspector, writer, nullable); + this.isDirectV2 = isNewWriteFormat(writer); + MapObjectInspector insp = (MapObjectInspector) inspector; + childrenWriters = new TreeWriter[2]; + childrenWriters[0] = + createTreeWriter(insp.getMapKeyObjectInspector(), writer, true); + childrenWriters[1] = + createTreeWriter(insp.getMapValueObjectInspector(), writer, true); + lengths = createIntegerWriter(writer.createStream(columnId, + OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); + recordPosition(rowIndexPosition); + } + + @Override + OrcProto.ColumnEncoding getEncoding() { + if (isDirectV2) { + return OrcProto.ColumnEncoding.newBuilder() + .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + } + return OrcProto.ColumnEncoding.newBuilder() + .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + } + + @Override + void write(Object obj) throws IOException { + super.write(obj); + if (obj != null) { + MapObjectInspector insp = (MapObjectInspector) inspector; + // this sucks, but it will have to do until we can get a better + // accessor in the MapObjectInspector. 
+ Map valueMap = insp.getMap(obj); + lengths.write(valueMap.size()); + if (createBloomFilter) { + bloomFilter.addLong(valueMap.size()); + } + for(Map.Entry entry: valueMap.entrySet()) { + childrenWriters[0].write(entry.getKey()); + childrenWriters[1].write(entry.getValue()); + } + } + } + + @Override + void writeStripe(OrcProto.StripeFooter.Builder builder, + int requiredIndexEntries) throws IOException { + super.writeStripe(builder, requiredIndexEntries); + lengths.flush(); + for(TreeWriter child: childrenWriters) { + child.writeStripe(builder, requiredIndexEntries); + } + recordPosition(rowIndexPosition); + } + + @Override + void recordPosition(PositionRecorder recorder) throws IOException { + super.recordPosition(recorder); + lengths.getPosition(recorder); + } + } + + private static class UnionTreeWriter extends TreeWriter { + private final RunLengthByteWriter tags; + + UnionTreeWriter(int columnId, + ObjectInspector inspector, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, inspector, writer, nullable); + UnionObjectInspector insp = (UnionObjectInspector) inspector; + List choices = insp.getObjectInspectors(); + childrenWriters = new TreeWriter[choices.size()]; + for(int i=0; i < childrenWriters.length; ++i) { + childrenWriters[i] = createTreeWriter(choices.get(i), writer, true); + } + tags = + new RunLengthByteWriter(writer.createStream(columnId, + OrcProto.Stream.Kind.DATA)); + recordPosition(rowIndexPosition); + } + + @Override + void write(Object obj) throws IOException { + super.write(obj); + if (obj != null) { + UnionObjectInspector insp = (UnionObjectInspector) inspector; + byte tag = insp.getTag(obj); + tags.write(tag); + if (createBloomFilter) { + bloomFilter.addLong(tag); + } + childrenWriters[tag].write(insp.getField(obj)); + } + } + + @Override + void writeStripe(OrcProto.StripeFooter.Builder builder, + int requiredIndexEntries) throws IOException { + super.writeStripe(builder, requiredIndexEntries); + 
tags.flush(); + for(TreeWriter child: childrenWriters) { + child.writeStripe(builder, requiredIndexEntries); + } + recordPosition(rowIndexPosition); + } + + @Override + void recordPosition(PositionRecorder recorder) throws IOException { + super.recordPosition(recorder); + tags.getPosition(recorder); + } + } + + private static TreeWriter createTreeWriter(ObjectInspector inspector, + StreamFactory streamFactory, + boolean nullable) throws IOException { + switch (inspector.getCategory()) { + case PRIMITIVE: + switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { + case BOOLEAN: + return new BooleanTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case BYTE: + return new ByteTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case SHORT: + case INT: + case LONG: + return new IntegerTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case FLOAT: + return new FloatTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case DOUBLE: + return new DoubleTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case STRING: + return new StringTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case CHAR: + return new CharTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case VARCHAR: + return new VarcharTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case BINARY: + return new BinaryTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case TIMESTAMP: + return new TimestampTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case DATE: + return new DateTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); + case DECIMAL: + return new DecimalTreeWriter(streamFactory.getNextColumnId(), + inspector, 
streamFactory, nullable); + default: + throw new IllegalArgumentException("Bad primitive category " + + ((PrimitiveObjectInspector) inspector).getPrimitiveCategory()); + } + case STRUCT: + return new StructTreeWriter(streamFactory.getNextColumnId(), inspector, + streamFactory, nullable); + case MAP: + return new MapTreeWriter(streamFactory.getNextColumnId(), inspector, + streamFactory, nullable); + case LIST: + return new ListTreeWriter(streamFactory.getNextColumnId(), inspector, + streamFactory, nullable); + case UNION: + return new UnionTreeWriter(streamFactory.getNextColumnId(), inspector, + streamFactory, nullable); + default: + throw new IllegalArgumentException("Bad category: " + + inspector.getCategory()); + } + } + + private static void writeTypes(OrcProto.Footer.Builder builder, + TreeWriter treeWriter) { + OrcProto.Type.Builder type = OrcProto.Type.newBuilder(); + switch (treeWriter.inspector.getCategory()) { + case PRIMITIVE: + switch (((PrimitiveObjectInspector) treeWriter.inspector). 
+ getPrimitiveCategory()) { + case BOOLEAN: + type.setKind(OrcProto.Type.Kind.BOOLEAN); + break; + case BYTE: + type.setKind(OrcProto.Type.Kind.BYTE); + break; + case SHORT: + type.setKind(OrcProto.Type.Kind.SHORT); + break; + case INT: + type.setKind(OrcProto.Type.Kind.INT); + break; + case LONG: + type.setKind(OrcProto.Type.Kind.LONG); + break; + case FLOAT: + type.setKind(OrcProto.Type.Kind.FLOAT); + break; + case DOUBLE: + type.setKind(OrcProto.Type.Kind.DOUBLE); + break; + case STRING: + type.setKind(OrcProto.Type.Kind.STRING); + break; + case CHAR: + // The char length needs to be written to file and should be available + // from the object inspector + CharTypeInfo charTypeInfo = (CharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector).getTypeInfo(); + type.setKind(Type.Kind.CHAR); + type.setMaximumLength(charTypeInfo.getLength()); + break; + case VARCHAR: + // The varchar length needs to be written to file and should be available + // from the object inspector + VarcharTypeInfo typeInfo = (VarcharTypeInfo) ((PrimitiveObjectInspector) treeWriter.inspector).getTypeInfo(); + type.setKind(Type.Kind.VARCHAR); + type.setMaximumLength(typeInfo.getLength()); + break; + case BINARY: + type.setKind(OrcProto.Type.Kind.BINARY); + break; + case TIMESTAMP: + type.setKind(OrcProto.Type.Kind.TIMESTAMP); + break; + case DATE: + type.setKind(OrcProto.Type.Kind.DATE); + break; + case DECIMAL: + DecimalTypeInfo decTypeInfo = (DecimalTypeInfo)((PrimitiveObjectInspector)treeWriter.inspector).getTypeInfo(); + type.setKind(OrcProto.Type.Kind.DECIMAL); + type.setPrecision(decTypeInfo.precision()); + type.setScale(decTypeInfo.scale()); + break; + default: + throw new IllegalArgumentException("Unknown primitive category: " + + ((PrimitiveObjectInspector) treeWriter.inspector). 
+ getPrimitiveCategory()); + } + break; + case LIST: + type.setKind(OrcProto.Type.Kind.LIST); + type.addSubtypes(treeWriter.childrenWriters[0].id); + break; + case MAP: + type.setKind(OrcProto.Type.Kind.MAP); + type.addSubtypes(treeWriter.childrenWriters[0].id); + type.addSubtypes(treeWriter.childrenWriters[1].id); + break; + case STRUCT: + type.setKind(OrcProto.Type.Kind.STRUCT); + for(TreeWriter child: treeWriter.childrenWriters) { + type.addSubtypes(child.id); + } + for(StructField field: ((StructTreeWriter) treeWriter).fields) { + type.addFieldNames(field.getFieldName()); + } + break; + case UNION: + type.setKind(OrcProto.Type.Kind.UNION); + for(TreeWriter child: treeWriter.childrenWriters) { + type.addSubtypes(child.id); + } + break; + default: + throw new IllegalArgumentException("Unknown category: " + + treeWriter.inspector.getCategory()); + } + builder.addTypes(type); + for(TreeWriter child: treeWriter.childrenWriters) { + writeTypes(builder, child); + } + } + + @VisibleForTesting + FSDataOutputStream getStream() throws IOException { + if (rawWriter == null) { + rawWriter = fs.create(path, false, HDFS_BUFFER_SIZE, + fs.getDefaultReplication(path), blockSize); + rawWriter.writeBytes(OrcFile.MAGIC); + headerLength = rawWriter.getPos(); + writer = new OutStream("metadata", bufferSize, codec, + new DirectStream(rawWriter)); + protobufWriter = CodedOutputStream.newInstance(writer); + } + return rawWriter; + } + + private void createRowIndexEntry() throws IOException { + treeWriter.createRowIndexEntry(); + rowsInIndex = 0; + } + + private void flushStripe() throws IOException { + getStream(); + if (buildIndex && rowsInIndex != 0) { + createRowIndexEntry(); + } + if (rowsInStripe != 0) { + if (callback != null) { + callback.preStripeWrite(callbackContext); + } + // finalize the data for the stripe + int requiredIndexEntries = rowIndexStride == 0 ? 
0 : + (int) ((rowsInStripe + rowIndexStride - 1) / rowIndexStride); + OrcProto.StripeFooter.Builder builder = + OrcProto.StripeFooter.newBuilder(); + treeWriter.writeStripe(builder, requiredIndexEntries); + long indexSize = 0; + long dataSize = 0; + for(Map.Entry pair: streams.entrySet()) { + BufferedStream stream = pair.getValue(); + if (!stream.isSuppressed()) { + stream.flush(); + StreamName name = pair.getKey(); + long streamSize = pair.getValue().getOutputSize(); + builder.addStreams(OrcProto.Stream.newBuilder() + .setColumn(name.getColumn()) + .setKind(name.getKind()) + .setLength(streamSize)); + if (StreamName.Area.INDEX == name.getArea()) { + indexSize += streamSize; + } else { + dataSize += streamSize; + } + } + } + OrcProto.StripeFooter footer = builder.build(); + + // Do we need to pad the file so the stripe doesn't straddle a block + // boundary? + long start = rawWriter.getPos(); + final long currentStripeSize = indexSize + dataSize + footer.getSerializedSize(); + final long available = blockSize - (start % blockSize); + final long overflow = currentStripeSize - adjustedStripeSize; + final float availRatio = (float) available / (float) defaultStripeSize; + + if (availRatio > 0.0f && availRatio < 1.0f + && availRatio > paddingTolerance) { + // adjust default stripe size to fit into remaining space, also adjust + // the next stripe for correction based on the current stripe size + // and user specified padding tolerance. Since stripe size can overflow + // the default stripe size we should apply this correction to avoid + // writing portion of last stripe to next hdfs block. + float correction = overflow > 0 ? (float) overflow + / (float) adjustedStripeSize : 0.0f; + + // correction should not be greater than user specified padding + // tolerance + correction = correction > paddingTolerance ? 
paddingTolerance + : correction; + + // adjust next stripe size based on current stripe estimate correction + adjustedStripeSize = (long) ((1.0f - correction) * (availRatio * defaultStripeSize)); + } else if (availRatio >= 1.0) { + adjustedStripeSize = defaultStripeSize; + } + + if (availRatio < paddingTolerance && addBlockPadding) { + long padding = blockSize - (start % blockSize); + byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, padding)]; + LOG.info(String.format("Padding ORC by %d bytes (<= %.2f * %d)", + padding, availRatio, defaultStripeSize)); + start += padding; + while (padding > 0) { + int writeLen = (int) Math.min(padding, pad.length); + rawWriter.write(pad, 0, writeLen); + padding -= writeLen; + } + adjustedStripeSize = defaultStripeSize; + } else if (currentStripeSize < blockSize + && (start % blockSize) + currentStripeSize > blockSize) { + // even if you don't pad, reset the default stripe size when crossing a + // block boundary + adjustedStripeSize = defaultStripeSize; + } + + // write out the data streams + for(Map.Entry pair: streams.entrySet()) { + BufferedStream stream = pair.getValue(); + if (!stream.isSuppressed()) { + stream.spillTo(rawWriter); + } + stream.clear(); + } + footer.writeTo(protobufWriter); + protobufWriter.flush(); + writer.flush(); + long footerLength = rawWriter.getPos() - start - dataSize - indexSize; + OrcProto.StripeInformation dirEntry = + OrcProto.StripeInformation.newBuilder() + .setOffset(start) + .setNumberOfRows(rowsInStripe) + .setIndexLength(indexSize) + .setDataLength(dataSize) + .setFooterLength(footerLength).build(); + stripes.add(dirEntry); + rowCount += rowsInStripe; + rowsInStripe = 0; + } + } + + private long computeRawDataSize() { + long result = 0; + for (TreeWriter child : treeWriter.getChildrenWriters()) { + result += getRawDataSizeFromInspectors(child, child.inspector); + } + return result; + } + + private long getRawDataSizeFromInspectors(TreeWriter child, ObjectInspector oi) { + long total = 0; 
+ switch (oi.getCategory()) { + case PRIMITIVE: + total += getRawDataSizeFromPrimitives(child, oi); + break; + case LIST: + case MAP: + case UNION: + case STRUCT: + for (TreeWriter tw : child.childrenWriters) { + total += getRawDataSizeFromInspectors(tw, tw.inspector); + } + break; + default: + LOG.debug("Unknown object inspector category."); + break; + } + return total; + } + + private long getRawDataSizeFromPrimitives(TreeWriter child, ObjectInspector oi) { + long result = 0; + long numVals = child.fileStatistics.getNumberOfValues(); + switch (((PrimitiveObjectInspector) oi).getPrimitiveCategory()) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case FLOAT: + return numVals * JavaDataModel.get().primitive1(); + case LONG: + case DOUBLE: + return numVals * JavaDataModel.get().primitive2(); + case STRING: + case VARCHAR: + case CHAR: + // ORC strings are converted to java Strings. so use JavaDataModel to + // compute the overall size of strings + child = (StringTreeWriter) child; + StringColumnStatistics scs = (StringColumnStatistics) child.fileStatistics; + numVals = numVals == 0 ? 
1 : numVals; + int avgStringLen = (int) (scs.getSum() / numVals); + return numVals * JavaDataModel.get().lengthForStringOfLength(avgStringLen); + case DECIMAL: + return numVals * JavaDataModel.get().lengthOfDecimal(); + case DATE: + return numVals * JavaDataModel.get().lengthOfDate(); + case BINARY: + // get total length of binary blob + BinaryColumnStatistics bcs = (BinaryColumnStatistics) child.fileStatistics; + return bcs.getSum(); + case TIMESTAMP: + return numVals * JavaDataModel.get().lengthOfTimestamp(); + default: + LOG.debug("Unknown primitive category."); + break; + } + + return result; + } + + private OrcProto.CompressionKind writeCompressionKind(CompressionKind kind) { + switch (kind) { + case NONE: return OrcProto.CompressionKind.NONE; + case ZLIB: return OrcProto.CompressionKind.ZLIB; + case SNAPPY: return OrcProto.CompressionKind.SNAPPY; + case LZO: return OrcProto.CompressionKind.LZO; + default: + throw new IllegalArgumentException("Unknown compression " + kind); + } + } + + private void writeFileStatistics(OrcProto.Footer.Builder builder, + TreeWriter writer) throws IOException { + builder.addStatistics(writer.fileStatistics.serialize()); + for(TreeWriter child: writer.getChildrenWriters()) { + writeFileStatistics(builder, child); + } + } + + private int writeMetadata(long bodyLength) throws IOException { + getStream(); + OrcProto.Metadata.Builder builder = OrcProto.Metadata.newBuilder(); + for(OrcProto.StripeStatistics.Builder ssb : treeWriter.stripeStatsBuilders) { + builder.addStripeStats(ssb.build()); + } + + long startPosn = rawWriter.getPos(); + OrcProto.Metadata metadata = builder.build(); + metadata.writeTo(protobufWriter); + protobufWriter.flush(); + writer.flush(); + return (int) (rawWriter.getPos() - startPosn); + } + + private int writeFooter(long bodyLength) throws IOException { + getStream(); + OrcProto.Footer.Builder builder = OrcProto.Footer.newBuilder(); + builder.setContentLength(bodyLength); + 
builder.setHeaderLength(headerLength); + builder.setNumberOfRows(rowCount); + builder.setRowIndexStride(rowIndexStride); + // populate raw data size + rawDataSize = computeRawDataSize(); + // serialize the types + writeTypes(builder, treeWriter); + // add the stripe information + for(OrcProto.StripeInformation stripe: stripes) { + builder.addStripes(stripe); + } + // add the column statistics + writeFileStatistics(builder, treeWriter); + // add all of the user metadata + for(Map.Entry entry: userMetadata.entrySet()) { + builder.addMetadata(OrcProto.UserMetadataItem.newBuilder() + .setName(entry.getKey()).setValue(entry.getValue())); + } + long startPosn = rawWriter.getPos(); + OrcProto.Footer footer = builder.build(); + footer.writeTo(protobufWriter); + protobufWriter.flush(); + writer.flush(); + return (int) (rawWriter.getPos() - startPosn); + } + + private int writePostScript(int footerLength, int metadataLength) throws IOException { + OrcProto.PostScript.Builder builder = + OrcProto.PostScript.newBuilder() + .setCompression(writeCompressionKind(compress)) + .setFooterLength(footerLength) + .setMetadataLength(metadataLength) + .setMagic(OrcFile.MAGIC) + .addVersion(version.getMajor()) + .addVersion(version.getMinor()) + .setWriterVersion(OrcFile.WriterVersion.HIVE_8732.getId()); + if (compress != CompressionKind.NONE) { + builder.setCompressionBlockSize(bufferSize); + } + OrcProto.PostScript ps = builder.build(); + // need to write this uncompressed + long startPosn = rawWriter.getPos(); + ps.writeTo(rawWriter); + long length = rawWriter.getPos() - startPosn; + if (length > 255) { + throw new IllegalArgumentException("PostScript too large at " + length); + } + return (int) length; + } + + private long estimateStripeSize() { + long result = 0; + for(BufferedStream stream: streams.values()) { + result += stream.getBufferSize(); + } + result += treeWriter.estimateMemory(); + return result; + } + + @Override + public synchronized void addUserMetadata(String name, 
ByteBuffer value) { + userMetadata.put(name, ByteString.copyFrom(value)); + } + + @Override + public void addRow(Object row) throws IOException { + synchronized (this) { + treeWriter.write(row); + rowsInStripe += 1; + if (buildIndex) { + rowsInIndex += 1; + + if (rowsInIndex >= rowIndexStride) { + createRowIndexEntry(); + } + } + } + memoryManager.addedRow(); + } + + @Override + public void close() throws IOException { + if (callback != null) { + callback.preFooterWrite(callbackContext); + } + // remove us from the memory manager so that we don't get any callbacks + memoryManager.removeWriter(path); + // actually close the file + synchronized (this) { + flushStripe(); + int metadataLength = writeMetadata(rawWriter.getPos()); + int footerLength = writeFooter(rawWriter.getPos() - metadataLength); + rawWriter.writeByte(writePostScript(footerLength, metadataLength)); + rawWriter.close(); + } + } + + /** + * Raw data size will be compute when writing the file footer. Hence raw data + * size value will be available only after closing the writer. + */ + @Override + public long getRawDataSize() { + return rawDataSize; + } + + /** + * Row count gets updated when flushing the stripes. To get accurate row + * count call this method after writer is closed. 
+ */ + @Override + public long getNumberOfRows() { + return rowCount; + } + + @Override + public synchronized long writeIntermediateFooter() throws IOException { + // flush any buffered rows + flushStripe(); + // write a footer + if (stripesAtLastFlush != stripes.size()) { + if (callback != null) { + callback.preFooterWrite(callbackContext); + } + int metaLength = writeMetadata(rawWriter.getPos()); + int footLength = writeFooter(rawWriter.getPos() - metaLength); + rawWriter.writeByte(writePostScript(footLength, metaLength)); + stripesAtLastFlush = stripes.size(); + ShimLoader.getHadoopShims().hflush(rawWriter); + } + return rawWriter.getPos(); + } + + @Override + public void appendStripe(byte[] stripe, int offset, int length, + StripeInformation stripeInfo, + OrcProto.StripeStatistics stripeStatistics) throws IOException { + checkArgument(stripe != null, "Stripe must not be null"); + checkArgument(length <= stripe.length, + "Specified length must not be greater specified array length"); + checkArgument(stripeInfo != null, "Stripe information must not be null"); + checkArgument(stripeStatistics != null, + "Stripe statistics must not be null"); + + getStream(); + long start = rawWriter.getPos(); + long stripeLen = length; + long availBlockSpace = blockSize - (start % blockSize); + + // see if stripe can fit in the current hdfs block, else pad the remaining + // space in the block + if (stripeLen < blockSize && stripeLen > availBlockSpace && + addBlockPadding) { + byte[] pad = new byte[(int) Math.min(HDFS_BUFFER_SIZE, availBlockSpace)]; + LOG.info(String.format("Padding ORC by %d bytes while merging..", + availBlockSpace)); + start += availBlockSpace; + while (availBlockSpace > 0) { + int writeLen = (int) Math.min(availBlockSpace, pad.length); + rawWriter.write(pad, 0, writeLen); + availBlockSpace -= writeLen; + } + } + + rawWriter.write(stripe); + rowsInStripe = stripeStatistics.getColStats(0).getNumberOfValues(); + rowCount += rowsInStripe; + + // since we have 
already written the stripe, just update stripe statistics + treeWriter.stripeStatsBuilders.add(stripeStatistics.toBuilder()); + + // update file level statistics + updateFileStatistics(stripeStatistics); + + // update stripe information + OrcProto.StripeInformation dirEntry = OrcProto.StripeInformation + .newBuilder() + .setOffset(start) + .setNumberOfRows(rowsInStripe) + .setIndexLength(stripeInfo.getIndexLength()) + .setDataLength(stripeInfo.getDataLength()) + .setFooterLength(stripeInfo.getFooterLength()) + .build(); + stripes.add(dirEntry); + + // reset it after writing the stripe + rowsInStripe = 0; + } + + private void updateFileStatistics(OrcProto.StripeStatistics stripeStatistics) { + List cs = stripeStatistics.getColStatsList(); + List allWriters = getAllColumnTreeWriters(treeWriter); + for (int i = 0; i < allWriters.size(); i++) { + allWriters.get(i).fileStatistics.merge(ColumnStatisticsImpl.deserialize(cs.get(i))); + } + } + + private List getAllColumnTreeWriters(TreeWriter rootTreeWriter) { + List result = Lists.newArrayList(); + getAllColumnTreeWritersImpl(rootTreeWriter, result); + return result; + } + + private void getAllColumnTreeWritersImpl(TreeWriter tw, + List result) { + result.add(tw); + for (TreeWriter child : tw.childrenWriters) { + getAllColumnTreeWritersImpl(child, result); + } + } + + @Override + public void appendUserMetadata(List userMetadata) { + if (userMetadata != null) { + for (UserMetadataItem item : userMetadata) { + this.userMetadata.put(item.getName(), item.getValue()); + } + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java new file mode 100644 index 0000000000..d0a8fa7da3 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ZlibCodec.java @@ -0,0 +1,169 @@ +/** + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tajo.storage.thirdparty.orc; + +import org.apache.hadoop.hive.shims.HadoopShims.DirectCompressionType; +import org.apache.hadoop.hive.shims.HadoopShims.DirectDecompressorShim; +import org.apache.hadoop.hive.shims.ShimLoader; + +import javax.annotation.Nullable; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.EnumSet; +import java.util.zip.DataFormatException; +import java.util.zip.Deflater; +import java.util.zip.Inflater; + +class ZlibCodec implements CompressionCodec, DirectDecompressionCodec { + + private Boolean direct = null; + + private final int level; + private final int strategy; + + public ZlibCodec() { + level = Deflater.DEFAULT_COMPRESSION; + strategy = Deflater.DEFAULT_STRATEGY; + } + + private ZlibCodec(int level, int strategy) { + this.level = level; + this.strategy = strategy; + } + + @Override + public boolean compress(ByteBuffer in, ByteBuffer out, + ByteBuffer overflow) throws IOException { + Deflater deflater = new Deflater(level, true); + deflater.setStrategy(strategy); + int length = in.remaining(); + deflater.setInput(in.array(), in.arrayOffset() + in.position(), length); + deflater.finish(); + int outSize = 0; + int offset = out.arrayOffset() + 
out.position(); + while (!deflater.finished() && (length > outSize)) { + int size = deflater.deflate(out.array(), offset, out.remaining()); + out.position(size + out.position()); + outSize += size; + offset += size; + // if we run out of space in the out buffer, use the overflow + if (out.remaining() == 0) { + if (overflow == null) { + deflater.end(); + return false; + } + out = overflow; + offset = out.arrayOffset() + out.position(); + } + } + deflater.end(); + return length > outSize; + } + + @Override + public void decompress(ByteBuffer in, ByteBuffer out) throws IOException { + + if(in.isDirect() && out.isDirect()) { + directDecompress(in, out); + return; + } + + Inflater inflater = new Inflater(true); + inflater.setInput(in.array(), in.arrayOffset() + in.position(), + in.remaining()); + while (!(inflater.finished() || inflater.needsDictionary() || + inflater.needsInput())) { + try { + int count = inflater.inflate(out.array(), + out.arrayOffset() + out.position(), + out.remaining()); + out.position(count + out.position()); + } catch (DataFormatException dfe) { + throw new IOException("Bad compression data", dfe); + } + } + out.flip(); + inflater.end(); + in.position(in.limit()); + } + + @Override + public boolean isAvailable() { + if (direct == null) { + // see nowrap option in new Inflater(boolean) which disables zlib headers + try { + if (ShimLoader.getHadoopShims().getDirectDecompressor( + DirectCompressionType.ZLIB_NOHEADER) != null) { + direct = Boolean.valueOf(true); + } else { + direct = Boolean.valueOf(false); + } + } catch (UnsatisfiedLinkError ule) { + direct = Boolean.valueOf(false); + } + } + return direct.booleanValue(); + } + + @Override + public void directDecompress(ByteBuffer in, ByteBuffer out) + throws IOException { + DirectDecompressorShim decompressShim = ShimLoader.getHadoopShims() + .getDirectDecompressor(DirectCompressionType.ZLIB_NOHEADER); + decompressShim.decompress(in, out); + out.flip(); // flip for read + } + + @Override + public 
CompressionCodec modify(@Nullable EnumSet modifiers) { + + if (modifiers == null) { + return this; + } + + int l = this.level; + int s = this.strategy; + + for (Modifier m : modifiers) { + switch (m) { + case BINARY: + /* filtered == less LZ77, more huffman */ + s = Deflater.FILTERED; + break; + case TEXT: + s = Deflater.DEFAULT_STRATEGY; + break; + case FASTEST: + // deflate_fast looking for 8 byte patterns + l = Deflater.BEST_SPEED; + break; + case FAST: + // deflate_fast looking for 16 byte patterns + l = Deflater.BEST_SPEED + 1; + break; + case DEFAULT: + // deflate_slow looking for 128 byte patterns + l = Deflater.DEFAULT_COMPRESSION; + break; + default: + break; + } + } + return new ZlibCodec(l, s); + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/util/BloomFilter.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/util/BloomFilter.java new file mode 100644 index 0000000000..2bbd0cdc8e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/util/BloomFilter.java @@ -0,0 +1,291 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.thirdparty.orc.util; + +import java.util.Arrays; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are + * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of + * bloom filter false positive (element not present in bloom filter but test() says true) are + * possible but false negatives are not possible (if element is present then test() will never + * say false). The false positive probability is configurable (default: 5%) depending on which + * storage requirement may increase or decrease. Lower the false positive probability greater + * is the space requirement. + * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. + * During the creation of bloom filter expected number of entries must be specified. If the number + * of insertions exceed the specified initial number of entries then false positive probability will + * increase accordingly. + * + * Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash + * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash + * collisions for specific sequence of repeating bytes. 
Check the following link for more info + * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw + */ +public class BloomFilter { + public static final double DEFAULT_FPP = 0.05; + protected BitSet bitSet; + protected int numBits; + protected int numHashFunctions; + + public BloomFilter() { + } + + public BloomFilter(long expectedEntries) { + this(expectedEntries, DEFAULT_FPP); + } + + public BloomFilter(long expectedEntries, double fpp) { + checkArgument(expectedEntries > 0, "expectedEntries should be > 0"); + checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0"); + int nb = optimalNumOfBits(expectedEntries, fpp); + // make 'm' multiple of 64 + this.numBits = nb + (Long.SIZE - (nb % Long.SIZE)); + this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits); + this.bitSet = new BitSet(numBits); + } + + static int optimalNumOfHashFunctions(long n, long m) { + return Math.max(1, (int) Math.round((double) m / n * Math.log(2))); + } + + static int optimalNumOfBits(long n, double p) { + return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2))); + } + + public void add(byte[] val) { + if (val == null) { + addBytes(val, -1); + } else { + addBytes(val, val.length); + } + } + + public void addBytes(byte[] val, int length) { + // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter" + // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively + // implement a Bloom filter without any loss in the asymptotic false positive probability' + + // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned + // in the above paper + long hash64 = val == null ? 
Murmur3.NULL_HASHCODE : Murmur3.hash64(val, length); + addHash(hash64); + } + + private void addHash(long hash64) { + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + + for (int i = 1; i <= numHashFunctions; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % numBits; + bitSet.set(pos); + } + } + + public void addString(String val) { + if (val == null) { + add(null); + } else { + add(val.getBytes()); + } + } + + public void addLong(long val) { + addHash(getLongHash(val)); + } + + public void addDouble(double val) { + addLong(Double.doubleToLongBits(val)); + } + + public boolean test(byte[] val) { + if (val == null) { + return testBytes(val, -1); + } + return testBytes(val, val.length); + } + + public boolean testBytes(byte[] val, int length) { + long hash64 = val == null ? Murmur3.NULL_HASHCODE : Murmur3.hash64(val, length); + return testHash(hash64); + } + + private boolean testHash(long hash64) { + int hash1 = (int) hash64; + int hash2 = (int) (hash64 >>> 32); + + for (int i = 1; i <= numHashFunctions; i++) { + int combinedHash = hash1 + (i * hash2); + // hashcode should be positive, flip all the bits if it's negative + if (combinedHash < 0) { + combinedHash = ~combinedHash; + } + int pos = combinedHash % numBits; + if (!bitSet.get(pos)) { + return false; + } + } + return true; + } + + public boolean testString(String val) { + if (val == null) { + return test(null); + } else { + return test(val.getBytes()); + } + } + + public boolean testLong(long val) { + return testHash(getLongHash(val)); + } + + // Thomas Wang's integer hash function + // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm + private long getLongHash(long key) { + key = (~key) + (key << 21); // key = (key << 21) - key - 1; + key = key ^ (key >> 24); + key = (key + (key << 3)) + (key << 
8); // key * 265 + key = key ^ (key >> 14); + key = (key + (key << 2)) + (key << 4); // key * 21 + key = key ^ (key >> 28); + key = key + (key << 31); + return key; + } + + public boolean testDouble(double val) { + return testLong(Double.doubleToLongBits(val)); + } + + public long sizeInBytes() { + return getBitSize() / 8; + } + + public int getBitSize() { + return bitSet.getData().length * Long.SIZE; + } + + public int getNumHashFunctions() { + return numHashFunctions; + } + + public long[] getBitSet() { + return bitSet.getData(); + } + + @Override + public String toString() { + return "m: " + numBits + " k: " + numHashFunctions; + } + + /** + * Merge the specified bloom filter with current bloom filter. + * + * @param that - bloom filter to merge + */ + public void merge(BloomFilter that) { + if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) { + this.bitSet.putAll(that.bitSet); + } else { + throw new IllegalArgumentException("BloomFilters are not compatible for merging." + + " this - " + this.toString() + " that - " + that.toString()); + } + } + + public void reset() { + this.bitSet.clear(); + } + + /** + * Bare metal bit set implementation. For performance reasons, this implementation does not check + * for index bounds nor expand the bit set size if the specified index is greater than the size. + */ + public class BitSet { + private final long[] data; + + public BitSet(long bits) { + this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]); + } + + /** + * Deserialize long array as bit set. + * + * @param data - bit array + */ + public BitSet(long[] data) { + assert data.length > 0 : "data length is zero!"; + this.data = data; + } + + /** + * Sets the bit at specified index. + * + * @param index - position + */ + public void set(int index) { + data[index >>> 6] |= (1L << index); + } + + /** + * Returns true if the bit is set in the specified index. 
+ * + * @param index - position + * @return - value at the bit position + */ + public boolean get(int index) { + return (data[index >>> 6] & (1L << index)) != 0; + } + + /** + * Number of bits + */ + public long bitSize() { + return (long) data.length * Long.SIZE; + } + + public long[] getData() { + return data; + } + + /** + * Combines the two BitArrays using bitwise OR. + */ + public void putAll(BitSet array) { + assert data.length == array.data.length : + "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")"; + for (int i = 0; i < data.length; i++) { + data[i] |= array.data[i]; + } + } + + /** + * Clear the bit set. + */ + public void clear() { + Arrays.fill(data, 0); + } + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/util/Murmur3.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/util/Murmur3.java new file mode 100644 index 0000000000..98b3ce78e4 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/util/Murmur3.java @@ -0,0 +1,334 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.thirdparty.orc.util; + +/** + * Murmur3 is successor to Murmur2 fast non-crytographic hash algorithms. + * + * Murmur3 32 and 128 bit variants. + * 32-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#94 + * 128-bit Java port of https://code.google.com/p/smhasher/source/browse/trunk/MurmurHash3.cpp#255 + * + * This is a public domain code with no copyrights. + * From homepage of MurmurHash (https://code.google.com/p/smhasher/), + * "All MurmurHash versions are public domain software, and the author disclaims all copyright + * to their code." + */ +public class Murmur3 { + // from 64-bit linear congruential generator + public static final long NULL_HASHCODE = 2862933555777941757L; + + // Constants for 32 bit variant + private static final int C1_32 = 0xcc9e2d51; + private static final int C2_32 = 0x1b873593; + private static final int R1_32 = 15; + private static final int R2_32 = 13; + private static final int M_32 = 5; + private static final int N_32 = 0xe6546b64; + + // Constants for 128 bit variant + private static final long C1 = 0x87c37b91114253d5L; + private static final long C2 = 0x4cf5ad432745937fL; + private static final int R1 = 31; + private static final int R2 = 27; + private static final int R3 = 33; + private static final int M = 5; + private static final int N1 = 0x52dce729; + private static final int N2 = 0x38495ab5; + + private static final int DEFAULT_SEED = 104729; + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static int hash32(byte[] data) { + return hash32(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 32-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. 
(default 0) + * @return - hashcode + */ + public static int hash32(byte[] data, int length, int seed) { + int hash = seed; + final int nblocks = length >> 2; + + // body + for (int i = 0; i < nblocks; i++) { + int i_4 = i << 2; + int k = (data[i_4] & 0xff) + | ((data[i_4 + 1] & 0xff) << 8) + | ((data[i_4 + 2] & 0xff) << 16) + | ((data[i_4 + 3] & 0xff) << 24); + + // mix functions + k *= C1_32; + k = Integer.rotateLeft(k, R1_32); + k *= C2_32; + hash ^= k; + hash = Integer.rotateLeft(hash, R2_32) * M_32 + N_32; + } + + // tail + int idx = nblocks << 2; + int k1 = 0; + switch (length - idx) { + case 3: + k1 ^= data[idx + 2] << 16; + case 2: + k1 ^= data[idx + 1] << 8; + case 1: + k1 ^= data[idx]; + + // mix functions + k1 *= C1_32; + k1 = Integer.rotateLeft(k1, R1_32); + k1 *= C2_32; + hash ^= k1; + } + + // finalization + hash ^= length; + hash ^= (hash >>> 16); + hash *= 0x85ebca6b; + hash ^= (hash >>> 13); + hash *= 0xc2b2ae35; + hash ^= (hash >>> 16); + + return hash; + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode + */ + public static long hash64(byte[] data) { + return hash64(data, data.length, DEFAULT_SEED); + } + + public static long hash64(byte[] data, int length) { + return hash64(data, length, DEFAULT_SEED); + } + + /** + * Murmur3 64-bit variant. This is essentially MSB 8 bytes of Murmur3 128-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. 
(default is 0) + * @return - hashcode + */ + public static long hash64(byte[] data, int length, int seed) { + long hash = seed; + final int nblocks = length >> 3; + + // body + for (int i = 0; i < nblocks; i++) { + final int i8 = i << 3; + long k = ((long) data[i8] & 0xff) + | (((long) data[i8 + 1] & 0xff) << 8) + | (((long) data[i8 + 2] & 0xff) << 16) + | (((long) data[i8 + 3] & 0xff) << 24) + | (((long) data[i8 + 4] & 0xff) << 32) + | (((long) data[i8 + 5] & 0xff) << 40) + | (((long) data[i8 + 6] & 0xff) << 48) + | (((long) data[i8 + 7] & 0xff) << 56); + + // mix functions + k *= C1; + k = Long.rotateLeft(k, R1); + k *= C2; + hash ^= k; + hash = Long.rotateLeft(hash, R2) * M + N1; + } + + // tail + long k1 = 0; + int tailStart = nblocks << 3; + switch (length - tailStart) { + case 7: + k1 ^= ((long) data[tailStart + 6] & 0xff) << 48; + case 6: + k1 ^= ((long) data[tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= ((long) data[tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= ((long) data[tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= ((long) data[tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= ((long) data[tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= ((long) data[tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + hash ^= k1; + } + + // finalization + hash ^= length; + hash = fmix64(hash); + + return hash; + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data) { + return hash128(data, data.length, DEFAULT_SEED); + } + + /** + * Murmur3 128-bit variant. + * + * @param data - input byte array + * @param length - length of array + * @param seed - seed. 
(default is 0) + * @return - hashcode (2 longs) + */ + public static long[] hash128(byte[] data, int length, int seed) { + long h1 = seed; + long h2 = seed; + final int nblocks = length >> 4; + + // body + for (int i = 0; i < nblocks; i++) { + final int i16 = i << 4; + long k1 = ((long) data[i16] & 0xff) + | (((long) data[i16 + 1] & 0xff) << 8) + | (((long) data[i16 + 2] & 0xff) << 16) + | (((long) data[i16 + 3] & 0xff) << 24) + | (((long) data[i16 + 4] & 0xff) << 32) + | (((long) data[i16 + 5] & 0xff) << 40) + | (((long) data[i16 + 6] & 0xff) << 48) + | (((long) data[i16 + 7] & 0xff) << 56); + + long k2 = ((long) data[i16 + 8] & 0xff) + | (((long) data[i16 + 9] & 0xff) << 8) + | (((long) data[i16 + 10] & 0xff) << 16) + | (((long) data[i16 + 11] & 0xff) << 24) + | (((long) data[i16 + 12] & 0xff) << 32) + | (((long) data[i16 + 13] & 0xff) << 40) + | (((long) data[i16 + 14] & 0xff) << 48) + | (((long) data[i16 + 15] & 0xff) << 56); + + // mix functions for k1 + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + h1 = Long.rotateLeft(h1, R2); + h1 += h2; + h1 = h1 * M + N1; + + // mix functions for k2 + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + h2 = Long.rotateLeft(h2, R1); + h2 += h1; + h2 = h2 * M + N2; + } + + // tail + long k1 = 0; + long k2 = 0; + int tailStart = nblocks << 4; + switch (length - tailStart) { + case 15: + k2 ^= (long) (data[tailStart + 14] & 0xff) << 48; + case 14: + k2 ^= (long) (data[tailStart + 13] & 0xff) << 40; + case 13: + k2 ^= (long) (data[tailStart + 12] & 0xff) << 32; + case 12: + k2 ^= (long) (data[tailStart + 11] & 0xff) << 24; + case 11: + k2 ^= (long) (data[tailStart + 10] & 0xff) << 16; + case 10: + k2 ^= (long) (data[tailStart + 9] & 0xff) << 8; + case 9: + k2 ^= (long) (data[tailStart + 8] & 0xff); + k2 *= C2; + k2 = Long.rotateLeft(k2, R3); + k2 *= C1; + h2 ^= k2; + + case 8: + k1 ^= (long) (data[tailStart + 7] & 0xff) << 56; + case 7: + k1 ^= (long) (data[tailStart + 6] & 0xff) << 48; + 
case 6: + k1 ^= (long) (data[tailStart + 5] & 0xff) << 40; + case 5: + k1 ^= (long) (data[tailStart + 4] & 0xff) << 32; + case 4: + k1 ^= (long) (data[tailStart + 3] & 0xff) << 24; + case 3: + k1 ^= (long) (data[tailStart + 2] & 0xff) << 16; + case 2: + k1 ^= (long) (data[tailStart + 1] & 0xff) << 8; + case 1: + k1 ^= (long) (data[tailStart] & 0xff); + k1 *= C1; + k1 = Long.rotateLeft(k1, R1); + k1 *= C2; + h1 ^= k1; + } + + // finalization + h1 ^= length; + h2 ^= length; + + h1 += h2; + h2 += h1; + + h1 = fmix64(h1); + h2 = fmix64(h2); + + h1 += h2; + h2 += h1; + + return new long[]{h1, h2}; + } + + private static long fmix64(long h) { + h ^= (h >>> 33); + h *= 0xff51afd7ed558ccdL; + h ^= (h >>> 33); + h *= 0xc4ceb9fe1a85ec53L; + h ^= (h >>> 33); + return h; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto new file mode 100644 index 0000000000..c80cf6c269 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/proto/orc_proto.proto @@ -0,0 +1,217 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.thirdparty.orc; + +message IntegerStatistics { + optional sint64 minimum = 1; + optional sint64 maximum = 2; + optional sint64 sum = 3; +} + +message DoubleStatistics { + optional double minimum = 1; + optional double maximum = 2; + optional double sum = 3; +} + +message StringStatistics { + optional string minimum = 1; + optional string maximum = 2; + // sum will store the total length of all strings in a stripe + optional sint64 sum = 3; +} + +message BucketStatistics { + repeated uint64 count = 1 [packed=true]; +} + +message DecimalStatistics { + optional string minimum = 1; + optional string maximum = 2; + optional string sum = 3; +} + +message DateStatistics { + // min,max values saved as days since epoch + optional sint32 minimum = 1; + optional sint32 maximum = 2; +} + +message TimestampStatistics { + // min,max values saved as milliseconds since epoch + optional sint64 minimum = 1; + optional sint64 maximum = 2; +} + +message BinaryStatistics { + // sum will store the total binary blob length in a stripe + optional sint64 sum = 1; +} + +message ColumnStatistics { + optional uint64 numberOfValues = 1; + optional IntegerStatistics intStatistics = 2; + optional DoubleStatistics doubleStatistics = 3; + optional StringStatistics stringStatistics = 4; + optional BucketStatistics bucketStatistics = 5; + optional DecimalStatistics decimalStatistics = 6; + optional DateStatistics dateStatistics = 7; + optional BinaryStatistics binaryStatistics = 8; + optional TimestampStatistics timestampStatistics = 9; + optional bool hasNull = 10; +} + +message RowIndexEntry { + repeated uint64 positions = 1 [packed=true]; + optional ColumnStatistics statistics = 2; +} + +message RowIndex { + repeated RowIndexEntry entry = 1; +} + +message BloomFilter { + optional uint32 numHashFunctions = 1; + repeated fixed64 bitset = 2; +} + +message BloomFilterIndex { + repeated BloomFilter bloomFilter = 1; +} + +message Stream { + // if you add new 
index stream kinds, you need to make sure to update + // StreamName to ensure it is added to the stripe in the right area + enum Kind { + PRESENT = 0; + DATA = 1; + LENGTH = 2; + DICTIONARY_DATA = 3; + DICTIONARY_COUNT = 4; + SECONDARY = 5; + ROW_INDEX = 6; + BLOOM_FILTER = 7; + } + optional Kind kind = 1; + optional uint32 column = 2; + optional uint64 length = 3; +} + +message ColumnEncoding { + enum Kind { + DIRECT = 0; + DICTIONARY = 1; + DIRECT_V2 = 2; + DICTIONARY_V2 = 3; + } + optional Kind kind = 1; + optional uint32 dictionarySize = 2; +} + +message StripeFooter { + repeated Stream streams = 1; + repeated ColumnEncoding columns = 2; + optional string writerTimezone = 3; +} + +message Type { + enum Kind { + BOOLEAN = 0; + BYTE = 1; + SHORT = 2; + INT = 3; + LONG = 4; + FLOAT = 5; + DOUBLE = 6; + STRING = 7; + BINARY = 8; + TIMESTAMP = 9; + LIST = 10; + MAP = 11; + STRUCT = 12; + UNION = 13; + DECIMAL = 14; + DATE = 15; + VARCHAR = 16; + CHAR = 17; + } + optional Kind kind = 1; + repeated uint32 subtypes = 2 [packed=true]; + repeated string fieldNames = 3; + optional uint32 maximumLength = 4; + optional uint32 precision = 5; + optional uint32 scale = 6; +} + +message StripeInformation { + optional uint64 offset = 1; + optional uint64 indexLength = 2; + optional uint64 dataLength = 3; + optional uint64 footerLength = 4; + optional uint64 numberOfRows = 5; +} + +message UserMetadataItem { + optional string name = 1; + optional bytes value = 2; +} + +message StripeStatistics { + repeated ColumnStatistics colStats = 1; +} + +message Metadata { + repeated StripeStatistics stripeStats = 1; +} + +message Footer { + optional uint64 headerLength = 1; + optional uint64 contentLength = 2; + repeated StripeInformation stripes = 3; + repeated Type types = 4; + repeated UserMetadataItem metadata = 5; + optional uint64 numberOfRows = 6; + repeated ColumnStatistics statistics = 7; + optional uint32 rowIndexStride = 8; +} + +enum CompressionKind { + NONE = 0; + ZLIB = 1; + 
SNAPPY = 2; + LZO = 3; +} + +// Serialized length must be less that 255 bytes +message PostScript { + optional uint64 footerLength = 1; + optional CompressionKind compression = 2; + optional uint64 compressionBlockSize = 3; + // the version of the file format + // [0, 11] = Hive 0.11 + // [0, 12] = Hive 0.12 + repeated uint32 version = 4 [packed = true]; + optional uint64 metadataLength = 5; + // Version of the writer: + // 0 (or missing) = original + // 1 = HIVE-8732 fixed + optional uint32 writerVersion = 6; + // Leave this last in the record + optional string magic = 8000; +} From 011e16649e8c6972a6e57c655ee22ef688d13029 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 3 Jul 2015 18:12:00 +0900 Subject: [PATCH 100/141] ObjectInspector Test --- .../ObjectInspectorFactory.java | 72 +++++++++++ .../TajoDoubleObjectInspector.java | 76 +++++++++++ .../TajoFloatObjectInspector.java | 77 +++++++++++ .../TajoIntObjectInspector.java | 76 +++++++++++ .../TajoLongObjectInspector.java | 76 +++++++++++ .../TajoPrimitiveObjectInspector.java | 38 ++++++ .../TajoShortObjectInspector.java | 76 +++++++++++ .../TajoStringObjectInspector.java | 71 ++++++++++ .../TajoStructObjectInspector.java | 122 ++++++++++++++++++ .../TajoTimestampObjectInspector.java | 73 +++++++++++ .../orc/{TestORCScanner.java => TestOrc.java} | 30 ++++- 11 files changed, 782 insertions(+), 5 deletions(-) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java create mode 100644 
tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java rename tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/{TestORCScanner.java => TestOrc.java} (74%) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java new file mode 100644 index 0000000000..7dff7fc81c --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java @@ -0,0 +1,72 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.exception.UnsupportedException; + +public class ObjectInspectorFactory { + + public static StructObjectInspector buildStructObjectInspector(Schema schema) { + StructObjectInspector structOI = new TajoStructObjectInspector(schema); + return structOI; + } + + public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type dataType) throws UnsupportedException { + ObjectInspector oi; + + switch(dataType) { + case INT2: + oi = new TajoShortObjectInspector(); + break; + + case INT4: + oi = new TajoIntObjectInspector(); + break; + + case INT8: + oi = new TajoLongObjectInspector(); + break; + + case FLOAT4: + oi = new TajoFloatObjectInspector(); + break; + + case FLOAT8: + oi = new TajoDoubleObjectInspector(); + break; + + case TEXT: + oi = new TajoStringObjectInspector(); + break; + + case TIMESTAMP: + oi = new TajoTimestampObjectInspector(); + break; + + default: + throw new UnsupportedException(dataType.name()+" is not supported yet in OrcAppender"); + } + + return oi; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java new 
file mode 100644 index 0000000000..c28553500d --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.tajo.datum.Float8Datum; + +public class TajoDoubleObjectInspector extends TajoPrimitiveObjectInspector implements DoubleObjectInspector { + @Override + public double get(Object o) { + return ((Float8Datum)o).asFloat8(); + } + + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.doubleTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.DOUBLE; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public Object getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return null; + } + + @Override + public Object getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "FLOAT8"; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java new file mode 100644 index 0000000000..d1a7ad3f5b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java @@ -0,0 +1,77 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.tajo.datum.Float4Datum; +import org.apache.tajo.datum.Int8Datum; + +public class TajoFloatObjectInspector extends TajoPrimitiveObjectInspector implements DoubleObjectInspector { + @Override + public double get(Object o) { + return ((Float4Datum)o).asFloat4(); + } + + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.floatTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.FLOAT; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public Object getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return null; + } + + @Override + public Object getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "FLOAT4"; + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java new file mode 100644 index 0000000000..9718fef218 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.tajo.datum.Int4Datum; + +public class TajoIntObjectInspector extends TajoPrimitiveObjectInspector implements IntObjectInspector { + @Override + public int get(Object o) { + return ((Int4Datum)o).asInt4(); + } + + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.intTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.INT; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public Object getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return Integer.class; + } + + @Override + public Object getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "INT4"; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java new file mode 100644 index 0000000000..bc6610a31a --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.tajo.datum.Int8Datum; + +public class TajoLongObjectInspector extends TajoPrimitiveObjectInspector implements LongObjectInspector { + @Override + public long get(Object o) { + return ((Int8Datum)o).asInt8(); + } + + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.shortTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.SHORT; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public Object getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return Integer.class; + } + + @Override + public Object getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "INT8"; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java new file mode 100644 index 0000000000..90ac178fdd --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoPrimitiveObjectInspector.java @@ -0,0 +1,38 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; + +public abstract class TajoPrimitiveObjectInspector implements PrimitiveObjectInspector { + @Override + public Category getCategory() { + return Category.PRIMITIVE; + } + + @Override + public int precision() { + return 0; + } + + @Override + public int scale() { + return 0; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java new file mode 100644 index 0000000000..08ed694662 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java @@ -0,0 +1,76 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.tajo.datum.Int2Datum; + +public class TajoShortObjectInspector extends TajoPrimitiveObjectInspector implements ShortObjectInspector { + @Override + public short get(Object o) { + return ((Int2Datum)o).asInt2(); + } + + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.shortTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.SHORT; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public Object getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return Integer.class; + } + + @Override + public Object getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "INT2"; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java new file mode 100644 index 0000000000..a980f7cf89 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.Text; + +public class TajoStringObjectInspector extends TajoPrimitiveObjectInspector implements StringObjectInspector { + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.stringTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.STRING; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public Text getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return null; + } + + @Override + public String getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "TEXT"; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java new file mode 100644 index 0000000000..c2ac1954ec --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java @@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; +import org.apache.tajo.catalog.Column; +import org.apache.tajo.catalog.Schema; +import org.apache.tajo.datum.Datum; +import org.apache.tajo.exception.UnsupportedException; +import org.apache.tajo.storage.Tuple; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class TajoStructObjectInspector extends StructObjectInspector { + private final static Log LOG = LogFactory.getLog(TajoStructObjectInspector.class); + private Schema schema; + private List structFields; + + static class TajoStructField implements StructField { + private String name; + private ObjectInspector oi; + private String comment; + + TajoStructField(String name, ObjectInspector oi) { + this(name, oi, null); + } + + TajoStructField(String name, ObjectInspector oi, String comment) { + this.name = name; + this.oi = oi; + this.comment = comment; + } + + @Override + public String getFieldName() { + return name; + } + + @Override + public ObjectInspector getFieldObjectInspector() { + return oi; + } + + @Override + public String getFieldComment() { + return comment; + } + } + + TajoStructObjectInspector(Schema schema) { + this.schema = schema; + structFields = new ArrayList(schema.size()); + + for (Column c: schema.getColumns()) { + try { + TajoStructField field = new TajoStructField(c.getSimpleName(), + ObjectInspectorFactory.buildObjectInspectorByType(c.getDataType().getType())); + structFields.add(field); + } catch (UnsupportedException e) { + LOG.error(e.getMessage()); + } + } + } + + @Override + public List getAllStructFieldRefs() { + return structFields; + } + + @Override + public StructField 
getStructFieldRef(String s) { + for (TajoStructField field:structFields) { + if (field.getFieldName().equals(s)) { + return field; + } + } + + return null; + } + + @Override + public Object getStructFieldData(Object o, StructField structField) { + return null; + } + + @Override + public List getStructFieldsDataAsList(Object o) { + return null; + } + + @Override + public String getTypeName() { + return "struct"; + } + + @Override + public Category getCategory() { + return Category.STRUCT; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java new file mode 100644 index 0000000000..bb887e79da --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoTimestampObjectInspector.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.io.TimestampWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import java.sql.Timestamp; + +public class TajoTimestampObjectInspector extends TajoPrimitiveObjectInspector implements TimestampObjectInspector { + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.timestampTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.TIMESTAMP; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public TimestampWritable getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return null; + } + + @Override + public Timestamp getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "TIMESTAMP"; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java similarity index 74% rename from tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java rename to tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java index b4117931fe..88f2eca25d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java @@ -22,14 +22,18 @@ import org.apache.hadoop.fs.FileStatus; import 
org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.serde2.objectinspector.StructField; +import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.catalog.proto.CatalogProtos; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.conf.TajoConf; import org.apache.tajo.datum.TimestampDatum; import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.fragment.FileFragment; import org.apache.tajo.storage.fragment.Fragment; +import org.apache.tajo.storage.orc.objectinspector.ObjectInspectorFactory; import org.apache.tajo.util.KeyValueSet; import org.junit.After; import org.junit.Before; @@ -39,9 +43,10 @@ import java.io.IOException; import java.net.URL; +import java.util.List; -public class TestORCScanner { - private ORCScanner orcScanner; +public class TestOrc { + private OrcScanner orcScanner; public static Path getResourcePath(String path, String suffix) { URL resultBaseURL = ClassLoader.getSystemResource(path); @@ -66,11 +71,11 @@ public void setup() throws IOException { Configuration conf = new TajoConf(); - TableMeta meta = new TableMeta("ORC", new KeyValueSet()); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORC, new KeyValueSet()); Fragment fragment = getFileFragment(conf, "u_data_20.orc"); - orcScanner = new ORCScanner(conf, schema, meta, fragment); + orcScanner = new OrcScanner(conf, schema, meta, fragment); orcScanner.init(); } @@ -86,7 +91,7 @@ public void testReadTuple() { assertEquals(tuple.getText(3), "881250949"); // Timestamp test - TimestampDatum timestamp = (TimestampDatum)tuple.asDatum(4); + TimestampDatum timestamp = (TimestampDatum)tuple.get(4); assertEquals(timestamp.getYear(), 2008); assertEquals(timestamp.getMonthOfYear(), 12); @@ -104,4 +109,19 @@ public void end() { e.printStackTrace(); } } + + @Test + public void testWrite() { + Schema schema 
= new Schema(); + schema.addColumn("movieid", TajoDataTypes.Type.INT4); + schema.addColumn("rating", TajoDataTypes.Type.INT2); + schema.addColumn("comment", TajoDataTypes.Type.TEXT); + schema.addColumn("showtime", TajoDataTypes.Type.TIMESTAMP); + + StructObjectInspector structOI = ObjectInspectorFactory.buildStructObjectInspector(schema); + List fieldList = structOI.getAllStructFieldRefs(); + StructField midField = fieldList.get(0); + + assertEquals("movieid", midField.getFieldName()); + } } \ No newline at end of file From b96d4a2ce444eae660d521278902995b3a890160 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 3 Jul 2015 18:30:07 +0900 Subject: [PATCH 101/141] Move method position --- .../org/apache/tajo/storage/orc/TestOrc.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java index 88f2eca25d..9c91f09b1e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java @@ -101,15 +101,6 @@ public void testReadTuple() { } } - @After - public void end() { - try { - orcScanner.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } - @Test public void testWrite() { Schema schema = new Schema(); @@ -124,4 +115,13 @@ public void testWrite() { assertEquals("movieid", midField.getFieldName()); } + + @After + public void end() { + try { + orcScanner.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } } \ No newline at end of file From 522f11a7ca15c74773d72db84adc27aa2bd9141f Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 7 Jul 2015 13:50:02 +0900 Subject: [PATCH 102/141] basic test passed --- .../thirdparty/orc/ColumnStatisticsImpl.java | 49 +- .../thirdparty/orc/OrcNewOutputFormat.java | 78 --- 
.../thirdparty/orc/OrcOutputFormat.java | 189 ------- .../thirdparty/orc/StringRedBlackTree.java | 10 +- .../tajo/storage/thirdparty/orc/Writer.java | 9 +- .../storage/thirdparty/orc/WriterImpl.java | 492 +++--------------- .../org/apache/tajo/storage/orc/TestOrc.java | 59 ++- 7 files changed, 164 insertions(+), 722 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcNewOutputFormat.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcOutputFormat.java diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java index be2157a7cc..78cce1efa6 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java @@ -21,8 +21,7 @@ import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.hadoop.io.BytesWritable; -import org.apache.hadoop.io.Text; +import org.apache.tajo.datum.BlobDatum; import java.sql.Date; import java.sql.Timestamp; @@ -358,8 +357,8 @@ public String toString() { protected static final class StringStatisticsImpl extends ColumnStatisticsImpl implements StringColumnStatistics { - private Text minimum = null; - private Text maximum = null; + private String minimum = null; + private String maximum = null; private long sum = 0; StringStatisticsImpl() { @@ -369,10 +368,10 @@ protected static final class StringStatisticsImpl extends ColumnStatisticsImpl super(stats); OrcProto.StringStatistics str = stats.getStringStatistics(); if 
(str.hasMaximum()) { - maximum = new Text(str.getMaximum()); + maximum = str.getMaximum(); } if (str.hasMinimum()) { - minimum = new Text(str.getMinimum()); + minimum = str.getMinimum(); } if(str.hasSum()) { sum = str.getSum(); @@ -388,15 +387,15 @@ void reset() { } @Override - void updateString(Text value) { + void updateString(String value) { if (minimum == null) { - maximum = minimum = new Text(value); + maximum = minimum = value; } else if (minimum.compareTo(value) > 0) { - minimum = new Text(value); + minimum = value; } else if (maximum.compareTo(value) < 0) { - maximum = new Text(value); + maximum = value; } - sum += value.getLength(); + sum += value.length(); } @Override @@ -405,18 +404,18 @@ void merge(ColumnStatisticsImpl other) { StringStatisticsImpl str = (StringStatisticsImpl) other; if (minimum == null) { if (str.minimum != null) { - maximum = new Text(str.getMaximum()); - minimum = new Text(str.getMinimum()); + maximum = str.getMaximum(); + minimum = str.getMinimum(); } else { /* both are empty */ maximum = minimum = null; } } else if (str.minimum != null) { if (minimum.compareTo(str.minimum) > 0) { - minimum = new Text(str.getMinimum()); + minimum = str.getMinimum(); } if (maximum.compareTo(str.maximum) < 0) { - maximum = new Text(str.getMaximum()); + maximum = str.getMaximum(); } } sum += str.sum; @@ -444,12 +443,12 @@ OrcProto.ColumnStatistics.Builder serialize() { @Override public String getMinimum() { - return minimum == null ? null : minimum.toString(); + return minimum; } @Override public String getMaximum() { - return maximum == null ? 
null : maximum.toString(); + return maximum; } @Override @@ -495,8 +494,8 @@ void reset() { } @Override - void updateBinary(BytesWritable value) { - sum += value.getLength(); + void updateBinary(BlobDatum value) { + sum += value.size(); } @Override @@ -877,11 +876,7 @@ public String toString() { count = stats.getNumberOfValues(); } - if (stats.hasHasNull()) { - hasNull = stats.getHasNull(); - } else { - hasNull = true; - } + hasNull = !stats.hasHasNull() || stats.getHasNull(); } ColumnStatisticsImpl() { @@ -907,11 +902,11 @@ void updateDouble(double value) { throw new UnsupportedOperationException("Can't update double"); } - void updateString(Text value) { + void updateString(String value) { throw new UnsupportedOperationException("Can't update string"); } - void updateBinary(BytesWritable value) { + void updateBinary(BlobDatum value) { throw new UnsupportedOperationException("Can't update binary"); } @@ -928,7 +923,7 @@ void updateTimestamp(Timestamp value) { } boolean isStatsExists() { - return (count > 0 || hasNull == true); + return (count > 0 || hasNull); } void merge(ColumnStatisticsImpl stats) { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcNewOutputFormat.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcNewOutputFormat.java deleted file mode 100644 index f5fd2ab78c..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcNewOutputFormat.java +++ /dev/null @@ -1,78 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; -import org.apache.hadoop.hive.shims.ShimLoader; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.RecordWriter; -import org.apache.hadoop.mapreduce.TaskAttemptContext; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; - -import java.io.IOException; -import java.util.ArrayList; - -/** An OutputFormat that writes ORC files. */ -public class OrcNewOutputFormat extends - FileOutputFormat { - - private static class OrcRecordWriter - extends RecordWriter { - private Writer writer = null; - private final Path path; - private final OrcFile.WriterOptions options; - OrcRecordWriter(Path path, OrcFile.WriterOptions options) { - this.path = path; - this.options = options; - } - @Override - public void write(NullWritable key, OrcSerde.OrcSerdeRow row) - throws IOException, InterruptedException { - if (writer == null) { - options.inspector(row.getInspector()); - writer = OrcFile.createWriter(path, options); - } - writer.addRow(row.getRow()); - } - - @Override - public void close(TaskAttemptContext context) - throws IOException, InterruptedException { - if (writer == null) { - // a row with no columns - ObjectInspector inspector = ObjectInspectorFactory. 
- getStandardStructObjectInspector(new ArrayList(), - new ArrayList()); - options.inspector(inspector); - writer = OrcFile.createWriter(path, options); - } - writer.close(); - } - } - - @Override - public RecordWriter getRecordWriter(TaskAttemptContext context) - throws IOException, InterruptedException { - Path file = getDefaultWorkFile(context, ""); - return new - OrcRecordWriter(file, OrcFile.writerOptions( - ShimLoader.getHadoopShims().getConfiguration(context))); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcOutputFormat.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcOutputFormat.java deleted file mode 100644 index eceaa97f7d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcOutputFormat.java +++ /dev/null @@ -1,189 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.ql.io.StatsProvidingRecordWriter; -import org.apache.hadoop.hive.serde2.SerDeStats; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.Writable; -import org.apache.hadoop.mapred.FileOutputFormat; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordWriter; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.util.Progressable; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Properties; - -/** - * A Hive OutputFormat for ORC files. - */ -public class OrcOutputFormat extends FileOutputFormat { - private static class OrcRecordWriter - implements RecordWriter, - StatsProvidingRecordWriter { - private Writer writer = null; - private final Path path; - private final OrcFile.WriterOptions options; - private final SerDeStats stats; - - OrcRecordWriter(Path path, OrcFile.WriterOptions options) { - this.path = path; - this.options = options; - this.stats = new SerDeStats(); - } - - @Override - public void write(NullWritable nullWritable, - OrcSerde.OrcSerdeRow row) throws IOException { - if (writer == null) { - options.inspector(row.getInspector()); - writer = OrcFile.createWriter(path, options); - } - writer.addRow(row.getRow()); - } - - @Override - public void write(Writable row) throws IOException { - OrcSerde.OrcSerdeRow serdeRow = (OrcSerde.OrcSerdeRow) row; - if (writer == null) { - options.inspector(serdeRow.getInspector()); - writer = OrcFile.createWriter(path, options); - } - writer.addRow(serdeRow.getRow()); - } - - @Override - public void close(Reporter reporter) throws IOException { - close(true); - } - - @Override - public void close(boolean b) throws IOException { - // if we haven't written any rows, we need to create a file with a - // generic schema. 
- if (writer == null) { - // a row with no columns - ObjectInspector inspector = ObjectInspectorFactory. - getStandardStructObjectInspector(new ArrayList(), - new ArrayList()); - options.inspector(inspector); - writer = OrcFile.createWriter(path, options); - } - writer.close(); - } - - @Override - public SerDeStats getStats() { - stats.setRawDataSize(writer.getRawDataSize()); - stats.setRowCount(writer.getNumberOfRows()); - return stats; - } - } - - /** - * Helper method to get a parameter first from props if present, falling back to JobConf if not. - * Returns null if key is present in neither. - */ - private String getSettingFromPropsFallingBackToConf(String key, Properties props, JobConf conf){ - if ((props != null) && props.containsKey(key)){ - return props.getProperty(key); - } else if(conf != null) { - // If conf is not null, and the key is not present, Configuration.get() will - // return null for us. So, we don't have to check if it contains it. - return conf.get(key); - } else { - return null; - } - } - - private OrcFile.WriterOptions getOptions(JobConf conf, Properties props) { - OrcFile.WriterOptions options = OrcFile.writerOptions(conf); - String propVal ; - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.STRIPE_SIZE.getPropName(),props,conf)) != null){ - options.stripeSize(Long.parseLong(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.COMPRESSION.getPropName(),props,conf)) != null){ - options.compress(CompressionKind.valueOf(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.COMPRESSION_BLOCK_SIZE.getPropName(),props,conf)) != null){ - options.bufferSize(Integer.parseInt(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.ROW_INDEX_STRIDE.getPropName(),props,conf)) != null){ - options.rowIndexStride(Integer.parseInt(propVal)); - } - - if ((propVal = 
getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.ENABLE_INDEXES.getPropName(),props,conf)) != null){ - if ("false".equalsIgnoreCase(propVal)) { - options.rowIndexStride(0); - } - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.BLOCK_PADDING.getPropName(),props,conf)) != null){ - options.blockPadding(Boolean.parseBoolean(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.ENCODING_STRATEGY.getPropName(),props,conf)) != null){ - options.encodingStrategy(OrcFile.EncodingStrategy.valueOf(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.BLOOM_FILTER_COLUMNS.getPropName(), props, conf)) != null) { - options.bloomFilterColumns(propVal); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.BLOOM_FILTER_FPP.getPropName(), props, conf)) != null) { - options.bloomFilterFpp(Double.parseDouble(propVal)); - } - - return options; - } - - @Override - public RecordWriter - getRecordWriter(FileSystem fileSystem, JobConf conf, String name, - Progressable reporter) throws IOException { - return new - OrcRecordWriter(new Path(name), getOptions(conf,null)); - } - - public StatsProvidingRecordWriter - getHiveRecordWriter(JobConf conf, - Path path, - Class valueClass, - boolean isCompressed, - Properties tableProperties, - Progressable reporter) throws IOException { - return new OrcRecordWriter(path, getOptions(conf,tableProperties)); - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java index 7c698d14a5..8835cefa5e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/StringRedBlackTree.java @@ -29,7 +29,7 @@ class StringRedBlackTree extends RedBlackTree { private final DynamicByteArray byteArray = new DynamicByteArray(); private final DynamicIntArray keyOffsets; - private final Text newKey = new Text(); + private String newKey; public StringRedBlackTree(int initialCapacity) { super(initialCapacity); @@ -37,21 +37,21 @@ public StringRedBlackTree(int initialCapacity) { } public int add(String value) { - newKey.set(value); + newKey = value; return addNewKey(); } private int addNewKey() { // if the newKey is actually new, add it to our byteArray and store the offset & length if (add()) { - int len = newKey.getLength(); + int len = newKey.length(); keyOffsets.add(byteArray.add(newKey.getBytes(), 0, len)); } return lastAdd; } public int add(Text value) { - newKey.set(value); + newKey = value.toString(); return addNewKey(); } @@ -64,7 +64,7 @@ protected int compareValue(int position) { } else { end = keyOffsets.get(position+1); } - return byteArray.compare(newKey.getBytes(), 0, newKey.getLength(), + return byteArray.compare(newKey.getBytes(), 0, newKey.length(), start, end - start); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java index d8dae5343d..669b44fbd3 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/Writer.java @@ -18,6 +18,8 @@ package org.apache.tajo.storage.thirdparty.orc; +import org.apache.tajo.storage.Tuple; + import java.io.IOException; import java.nio.ByteBuffer; import java.util.List; @@ -35,12 +37,7 @@ public interface Writer { */ void addUserMetadata(String key, ByteBuffer value); - /** - * Add a row to the ORC file. 
- * @param row the row to add - * @throws IOException - */ - void addRow(Object row) throws IOException; + void addTuple(Tuple tuple) throws IOException; /** * Flush all of the buffers and close the file. No methods on this writer diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 4753f1b321..7683995165 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -30,23 +30,23 @@ import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.io.IOConstants; import org.apache.hadoop.hive.shims.ShimLoader; +import org.apache.tajo.datum.*; +import org.apache.tajo.storage.Tuple; import org.apache.tajo.storage.thirdparty.orc.CompressionCodec.Modifier; import org.apache.tajo.storage.thirdparty.orc.OrcProto.RowIndexEntry; import org.apache.tajo.storage.thirdparty.orc.OrcProto.StripeStatistics; import org.apache.tajo.storage.thirdparty.orc.OrcProto.Type; import org.apache.tajo.storage.thirdparty.orc.OrcProto.UserMetadataItem; import org.apache.hadoop.hive.ql.util.JavaDataModel; -import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.objectinspector.*; import org.apache.hadoop.hive.serde2.objectinspector.primitive.*; import org.apache.hadoop.hive.serde2.typeinfo.CharTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.VarcharTypeInfo; -import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; +import org.apache.tajo.util.datetime.DateTimeUtil; import java.io.IOException; import 
java.io.OutputStream; @@ -111,7 +111,6 @@ public class WriterImpl implements Writer, MemoryManager.Callback { new ArrayList(); private final Map userMetadata = new TreeMap(); - private final StreamFactory streamFactory = new StreamFactory(); private final TreeWriter treeWriter; private final boolean buildIndex; private final MemoryManager memoryManager; @@ -184,7 +183,7 @@ public Writer getWriter() { OrcUtils.includeColumns(bloomFilterColumnNames, allColumns, inspector); } this.bloomFilterFpp = bloomFilterFpp; - treeWriter = createTreeWriter(inspector, streamFactory, false); + treeWriter = createTreeWriter(inspector, new StreamFactory(), false); if (buildIndex && rowIndexStride < MIN_ROW_INDEX_STRIDE) { throw new IllegalArgumentException("Row stride must be at least " + MIN_ROW_INDEX_STRIDE); @@ -703,19 +702,38 @@ boolean isNewWriteFormat(StreamFactory writer) { /** * Add a new value to the column. - * @param obj + * @param datum * @throws IOException */ - void write(Object obj) throws IOException { - if (obj != null) { + void write(Datum datum) throws IOException { + if (datum != null && datum.isNotNull()) { indexStatistics.increment(); } else { indexStatistics.setNull(); } if (isPresent != null) { - isPresent.write(obj == null ? 
0 : 1); - if(obj == null) { + if(datum == null || datum.isNull()) { foundNulls = true; + isPresent.write(0); + } + else { + isPresent.write(1); + } + } + } + + void write(Tuple tuple) throws IOException { + if (tuple != null) { + indexStatistics.increment(); + } else { + indexStatistics.setNull(); + } + if (isPresent != null) { + if (tuple == null) { + foundNulls = true; + isPresent.write(0); + } else { + isPresent.write(1); } } } @@ -881,10 +899,10 @@ private static class BooleanTreeWriter extends TreeWriter { } @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - boolean val = ((BooleanObjectInspector) inspector).get(obj); + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { + boolean val = datum.asBool(); indexStatistics.updateBoolean(val); writer.write(val ? 1 : 0); } @@ -919,10 +937,10 @@ private static class ByteTreeWriter extends TreeWriter { } @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - byte val = ((ByteObjectInspector) inspector).get(obj); + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { + byte val = datum.asByte(); indexStatistics.updateInteger(val); if (createBloomFilter) { bloomFilter.addLong(val); @@ -990,16 +1008,16 @@ OrcProto.ColumnEncoding getEncoding() { } @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { long val; - if (intInspector != null) { - val = intInspector.get(obj); - } else if (longInspector != null) { - val = longInspector.get(obj); + if (datum instanceof Int4Datum) { + val = datum.asInt4(); + } else if (datum instanceof Int8Datum) { + val = datum.asInt8(); } else { - val = shortInspector.get(obj); + val = datum.asInt2(); } indexStatistics.updateInteger(val); 
if (createBloomFilter) { @@ -1041,10 +1059,10 @@ private static class FloatTreeWriter extends TreeWriter { } @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - float val = ((FloatObjectInspector) inspector).get(obj); + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { + float val = datum.asFloat4(); indexStatistics.updateDouble(val); if (createBloomFilter) { // floats are converted to doubles in column statistics and during SARG evaluation @@ -1085,10 +1103,10 @@ private static class DoubleTreeWriter extends TreeWriter { } @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - double val = ((DoubleObjectInspector) inspector).get(obj); + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { + double val = datum.asFloat8(); indexStatistics.updateDouble(val); if (createBloomFilter) { bloomFilter.addDouble(val); @@ -1161,31 +1179,20 @@ private static class StringTreeWriter extends TreeWriter { doneDictionaryCheck = false; } - /** - * Method to retrieve text values from the value object, which can be overridden - * by subclasses. 
- * @param obj value - * @return Text text value from obj - */ - Text getTextValue(Object obj) { - return ((StringObjectInspector) inspector).getPrimitiveWritableObject(obj); - } - @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - Text val = getTextValue(obj); + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { if (useDictionaryEncoding || !strideDictionaryCheck) { - rows.add(dictionary.add(val)); + rows.add(dictionary.add(datum.toString())); } else { // write data and length - directStreamOutput.write(val.getBytes(), 0, val.getLength()); - directLengthOutput.write(val.getLength()); + directStreamOutput.write(datum.asByteArray(), 0, datum.size()); + directLengthOutput.write(datum.size()); } - indexStatistics.updateString(val); + indexStatistics.updateString(datum.toString()); if (createBloomFilter) { - bloomFilter.addBytes(val.getBytes(), val.getLength()); + bloomFilter.addBytes(datum.asByteArray(), datum.size()); } } } @@ -1380,15 +1387,6 @@ private static class CharTreeWriter extends StringTreeWriter { boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); } - - /** - * Override base class implementation to support char values. - */ - @Override - Text getTextValue(Object obj) { - return (((HiveCharObjectInspector) inspector) - .getPrimitiveWritableObject(obj)).getTextValue(); - } } /** @@ -1402,15 +1400,6 @@ private static class VarcharTreeWriter extends StringTreeWriter { boolean nullable) throws IOException { super(columnId, inspector, writer, nullable); } - - /** - * Override base class implementation to support varchar values. 
- */ - @Override - Text getTextValue(Object obj) { - return (((HiveVarcharObjectInspector) inspector) - .getPrimitiveWritableObject(obj)).getTextValue(); - } } private static class BinaryTreeWriter extends TreeWriter { @@ -1442,16 +1431,15 @@ OrcProto.ColumnEncoding getEncoding() { } @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - BytesWritable val = - ((BinaryObjectInspector) inspector).getPrimitiveWritableObject(obj); - stream.write(val.getBytes(), 0, val.getLength()); - length.write(val.getLength()); + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { + BlobDatum val = (BlobDatum)datum; + stream.write(val.asByteArray(), 0, val.size()); + length.write(val.size()); indexStatistics.updateBinary(val); if (createBloomFilter) { - bloomFilter.addBytes(val.getBytes(), val.getLength()); + bloomFilter.addBytes(val.asByteArray(), val.size()); } } } @@ -1508,12 +1496,10 @@ OrcProto.ColumnEncoding getEncoding() { } @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - Timestamp val = - ((TimestampObjectInspector) inspector). 
- getPrimitiveJavaObject(obj); + void write(Datum datum) throws IOException { + super.write(datum); + if (datum != null && datum.isNotNull()) { + Timestamp val = new Timestamp(DateTimeUtil.julianTimeToJavaTime(datum.asInt8())); indexStatistics.updateTimestamp(val); seconds.write((val.getTime() / MILLIS_PER_SECOND) - base_timestamp); nanos.write(formatNanos(val.getNanos())); @@ -1556,124 +1542,6 @@ void recordPosition(PositionRecorder recorder) throws IOException { } } - private static class DateTreeWriter extends TreeWriter { - private final IntegerWriter writer; - private final boolean isDirectV2; - - DateTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); - OutStream out = writer.createStream(id, - OrcProto.Stream.Kind.DATA); - this.isDirectV2 = isNewWriteFormat(writer); - this.writer = createIntegerWriter(out, true, isDirectV2, writer); - recordPosition(rowIndexPosition); - } - - @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - // Using the Writable here as it's used directly for writing as well as for stats. 
- DateWritable val = ((DateObjectInspector) inspector).getPrimitiveWritableObject(obj); - indexStatistics.updateDate(val); - writer.write(val.getDays()); - if (createBloomFilter) { - bloomFilter.addLong(val.getDays()); - } - } - } - - @Override - void writeStripe(OrcProto.StripeFooter.Builder builder, - int requiredIndexEntries) throws IOException { - super.writeStripe(builder, requiredIndexEntries); - writer.flush(); - recordPosition(rowIndexPosition); - } - - @Override - void recordPosition(PositionRecorder recorder) throws IOException { - super.recordPosition(recorder); - writer.getPosition(recorder); - } - - @Override - OrcProto.ColumnEncoding getEncoding() { - if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); - } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); - } - } - - private static class DecimalTreeWriter extends TreeWriter { - private final PositionedOutputStream valueStream; - private final IntegerWriter scaleStream; - private final boolean isDirectV2; - - DecimalTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); - this.isDirectV2 = isNewWriteFormat(writer); - valueStream = writer.createStream(id, OrcProto.Stream.Kind.DATA); - this.scaleStream = createIntegerWriter(writer.createStream(id, - OrcProto.Stream.Kind.SECONDARY), true, isDirectV2, writer); - recordPosition(rowIndexPosition); - } - - @Override - OrcProto.ColumnEncoding getEncoding() { - if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); - } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); - } - - @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - HiveDecimal decimal = 
((HiveDecimalObjectInspector) inspector). - getPrimitiveJavaObject(obj); - if (decimal == null) { - return; - } - SerializationUtils.writeBigInteger(valueStream, - decimal.unscaledValue()); - scaleStream.write(decimal.scale()); - indexStatistics.updateDecimal(decimal); - if (createBloomFilter) { - bloomFilter.addString(decimal.toString()); - } - } - } - - @Override - void writeStripe(OrcProto.StripeFooter.Builder builder, - int requiredIndexEntries) throws IOException { - super.writeStripe(builder, requiredIndexEntries); - valueStream.flush(); - scaleStream.flush(); - recordPosition(rowIndexPosition); - } - - @Override - void recordPosition(PositionRecorder recorder) throws IOException { - super.recordPosition(recorder); - valueStream.getPosition(recorder); - scaleStream.getPosition(recorder); - } - } - private static class StructTreeWriter extends TreeWriter { private final List fields; StructTreeWriter(int columnId, @@ -1693,192 +1561,16 @@ private static class StructTreeWriter extends TreeWriter { } @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - StructObjectInspector insp = (StructObjectInspector) inspector; - for(int i = 0; i < fields.size(); ++i) { - StructField field = fields.get(i); - TreeWriter writer = childrenWriters[i]; - writer.write(insp.getStructFieldData(obj, field)); - } - } - } - - @Override - void writeStripe(OrcProto.StripeFooter.Builder builder, - int requiredIndexEntries) throws IOException { - super.writeStripe(builder, requiredIndexEntries); - for(TreeWriter child: childrenWriters) { - child.writeStripe(builder, requiredIndexEntries); - } - recordPosition(rowIndexPosition); - } - } - - private static class ListTreeWriter extends TreeWriter { - private final IntegerWriter lengths; - private final boolean isDirectV2; - - ListTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); - 
this.isDirectV2 = isNewWriteFormat(writer); - ListObjectInspector listObjectInspector = (ListObjectInspector) inspector; - childrenWriters = new TreeWriter[1]; - childrenWriters[0] = - createTreeWriter(listObjectInspector.getListElementObjectInspector(), - writer, true); - lengths = createIntegerWriter(writer.createStream(columnId, - OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); - recordPosition(rowIndexPosition); - } - - @Override - OrcProto.ColumnEncoding getEncoding() { - if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); - } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); - } - - @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - ListObjectInspector insp = (ListObjectInspector) inspector; - int len = insp.getListLength(obj); - lengths.write(len); - if (createBloomFilter) { - bloomFilter.addLong(len); - } - for(int i=0; i < len; ++i) { - childrenWriters[0].write(insp.getListElement(obj, i)); - } - } - } - - @Override - void writeStripe(OrcProto.StripeFooter.Builder builder, - int requiredIndexEntries) throws IOException { - super.writeStripe(builder, requiredIndexEntries); - lengths.flush(); - for(TreeWriter child: childrenWriters) { - child.writeStripe(builder, requiredIndexEntries); - } - recordPosition(rowIndexPosition); - } - - @Override - void recordPosition(PositionRecorder recorder) throws IOException { - super.recordPosition(recorder); - lengths.getPosition(recorder); - } - } - - private static class MapTreeWriter extends TreeWriter { - private final IntegerWriter lengths; - private final boolean isDirectV2; - - MapTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - super(columnId, inspector, writer, nullable); - this.isDirectV2 = isNewWriteFormat(writer); - MapObjectInspector insp = 
(MapObjectInspector) inspector; - childrenWriters = new TreeWriter[2]; - childrenWriters[0] = - createTreeWriter(insp.getMapKeyObjectInspector(), writer, true); - childrenWriters[1] = - createTreeWriter(insp.getMapValueObjectInspector(), writer, true); - lengths = createIntegerWriter(writer.createStream(columnId, - OrcProto.Stream.Kind.LENGTH), false, isDirectV2, writer); - recordPosition(rowIndexPosition); - } - - @Override - OrcProto.ColumnEncoding getEncoding() { - if (isDirectV2) { - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); - } - return OrcProto.ColumnEncoding.newBuilder() - .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); - } - - @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - MapObjectInspector insp = (MapObjectInspector) inspector; - // this sucks, but it will have to do until we can get a better - // accessor in the MapObjectInspector. - Map valueMap = insp.getMap(obj); - lengths.write(valueMap.size()); - if (createBloomFilter) { - bloomFilter.addLong(valueMap.size()); - } - for(Map.Entry entry: valueMap.entrySet()) { - childrenWriters[0].write(entry.getKey()); - childrenWriters[1].write(entry.getValue()); - } - } - } - - @Override - void writeStripe(OrcProto.StripeFooter.Builder builder, - int requiredIndexEntries) throws IOException { - super.writeStripe(builder, requiredIndexEntries); - lengths.flush(); - for(TreeWriter child: childrenWriters) { - child.writeStripe(builder, requiredIndexEntries); - } - recordPosition(rowIndexPosition); - } - - @Override - void recordPosition(PositionRecorder recorder) throws IOException { - super.recordPosition(recorder); - lengths.getPosition(recorder); - } - } - - private static class UnionTreeWriter extends TreeWriter { - private final RunLengthByteWriter tags; - - UnionTreeWriter(int columnId, - ObjectInspector inspector, - StreamFactory writer, - boolean nullable) throws IOException { - 
super(columnId, inspector, writer, nullable); - UnionObjectInspector insp = (UnionObjectInspector) inspector; - List choices = insp.getObjectInspectors(); - childrenWriters = new TreeWriter[choices.size()]; - for(int i=0; i < childrenWriters.length; ++i) { - childrenWriters[i] = createTreeWriter(choices.get(i), writer, true); - } - tags = - new RunLengthByteWriter(writer.createStream(columnId, - OrcProto.Stream.Kind.DATA)); - recordPosition(rowIndexPosition); + void write(Datum datum) throws IOException { } - @Override - void write(Object obj) throws IOException { - super.write(obj); - if (obj != null) { - UnionObjectInspector insp = (UnionObjectInspector) inspector; - byte tag = insp.getTag(obj); - tags.write(tag); - if (createBloomFilter) { - bloomFilter.addLong(tag); + void writeTuple(Tuple tuple) throws IOException { + super.write(tuple); + if (tuple != null) { + for(int i = 0; i < fields.size(); ++i) { + TreeWriter writer = childrenWriters[i]; + writer.write(tuple.get(i)); } - childrenWriters[tag].write(insp.getField(obj)); } } @@ -1886,18 +1578,11 @@ void write(Object obj) throws IOException { void writeStripe(OrcProto.StripeFooter.Builder builder, int requiredIndexEntries) throws IOException { super.writeStripe(builder, requiredIndexEntries); - tags.flush(); for(TreeWriter child: childrenWriters) { child.writeStripe(builder, requiredIndexEntries); } recordPosition(rowIndexPosition); } - - @Override - void recordPosition(PositionRecorder recorder) throws IOException { - super.recordPosition(recorder); - tags.getPosition(recorder); - } } private static TreeWriter createTreeWriter(ObjectInspector inspector, @@ -1938,12 +1623,6 @@ private static TreeWriter createTreeWriter(ObjectInspector inspector, case TIMESTAMP: return new TimestampTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); - case DATE: - return new DateTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); - case DECIMAL: - return new 
DecimalTreeWriter(streamFactory.getNextColumnId(), - inspector, streamFactory, nullable); default: throw new IllegalArgumentException("Bad primitive category " + ((PrimitiveObjectInspector) inspector).getPrimitiveCategory()); @@ -1951,15 +1630,6 @@ private static TreeWriter createTreeWriter(ObjectInspector inspector, case STRUCT: return new StructTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); - case MAP: - return new MapTreeWriter(streamFactory.getNextColumnId(), inspector, - streamFactory, nullable); - case LIST: - return new ListTreeWriter(streamFactory.getNextColumnId(), inspector, - streamFactory, nullable); - case UNION: - return new UnionTreeWriter(streamFactory.getNextColumnId(), inspector, - streamFactory, nullable); default: throw new IllegalArgumentException("Bad category: " + inspector.getCategory()); @@ -2366,10 +2036,9 @@ public synchronized void addUserMetadata(String name, ByteBuffer value) { userMetadata.put(name, ByteString.copyFrom(value)); } - @Override - public void addRow(Object row) throws IOException { + public void addTuple(Tuple tuple) throws IOException { synchronized (this) { - treeWriter.write(row); + ((StructTreeWriter)treeWriter).writeTuple(tuple); rowsInStripe += 1; if (buildIndex) { rowsInIndex += 1; @@ -2379,7 +2048,6 @@ public void addRow(Object row) throws IOException { } } } - memoryManager.addedRow(); } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java index 9c91f09b1e..362b7aa106 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java @@ -29,11 +29,18 @@ import org.apache.tajo.catalog.proto.CatalogProtos; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.conf.TajoConf; +import 
org.apache.tajo.datum.Int2Datum; +import org.apache.tajo.datum.Int4Datum; +import org.apache.tajo.datum.TextDatum; import org.apache.tajo.datum.TimestampDatum; import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.VTuple; import org.apache.tajo.storage.fragment.FileFragment; import org.apache.tajo.storage.fragment.Fragment; import org.apache.tajo.storage.orc.objectinspector.ObjectInspectorFactory; +import org.apache.tajo.storage.thirdparty.orc.OrcFile; +import org.apache.tajo.storage.thirdparty.orc.CompressionKind; +import org.apache.tajo.storage.thirdparty.orc.Writer; import org.apache.tajo.util.KeyValueSet; import org.junit.After; import org.junit.Before; @@ -48,14 +55,24 @@ public class TestOrc { private OrcScanner orcScanner; + private static Configuration conf = new TajoConf(); + private static FileSystem fs; + + static { + try { + fs = FileSystem.getLocal(conf); + } catch (IOException e) { + e.printStackTrace(); + } + } + public static Path getResourcePath(String path, String suffix) { URL resultBaseURL = ClassLoader.getSystemResource(path); return new Path(resultBaseURL.toString(), suffix); } - private static FileFragment getFileFragment(Configuration conf, String fileName) throws IOException { + private static FileFragment getFileFragment(String fileName) throws IOException { Path tablePath = new Path(getResourcePath("dataset", "."), fileName); - FileSystem fs = FileSystem.getLocal(conf); FileStatus status = fs.getFileStatus(tablePath); return new FileFragment("table", tablePath, 0, status.getLen()); } @@ -69,11 +86,9 @@ public void setup() throws IOException { schema.addColumn("unixtimestamp", TajoDataTypes.Type.TEXT); schema.addColumn("faketime", TajoDataTypes.Type.TIMESTAMP); - Configuration conf = new TajoConf(); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORC, new KeyValueSet()); - Fragment fragment = getFileFragment(conf, "u_data_20.orc"); + Fragment fragment = getFileFragment("u_data_20.orc"); orcScanner = new 
OrcScanner(conf, schema, meta, fragment); @@ -114,6 +129,40 @@ public void testWrite() { StructField midField = fieldList.get(0); assertEquals("movieid", midField.getFieldName()); + + Path writePath = new Path(getResourcePath("dataset", "."), "temp_test.orc"); + + try { + if (fs.exists(writePath)) { + fs.delete(writePath); + } + + Writer orcWriter = OrcFile.createWriter(fs, writePath, conf, structOI, 1000, CompressionKind.NONE, 100, 1000); + + Tuple tuple = new VTuple(schema.size()); + tuple.put(0, new Int4Datum(100)); + tuple.put(1, new Int2Datum((short)7)); + tuple.put(2, new TextDatum("good")); + tuple.put(3, new TimestampDatum(System.currentTimeMillis() * 1000)); + + orcWriter.addTuple(tuple); + + orcWriter.close(); + + TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORC, new KeyValueSet()); + Fragment fragment = getFileFragment("temp_test.orc"); + OrcScanner orcScanner = new OrcScanner(conf, schema, meta, fragment); + orcScanner.init(); + + tuple = orcScanner.next(); + + assertEquals(100, tuple.getInt4(0)); + + orcScanner.close(); + + } catch (IOException e) { + e.printStackTrace(); + } } @After From 0e1bb5f7a1f992d2bf37f3fb1be3973906f36563 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 7 Jul 2015 17:48:10 +0900 Subject: [PATCH 103/141] initial orc appender --- .../apache/tajo/storage/StorageConstants.java | 17 +++ .../apache/tajo/storage/orc/OrcAppender.java | 108 ++++++++++++++++++ 2 files changed, 125 insertions(+) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java diff --git a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java index 9471632c2d..5fc51b1afa 100644 --- a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java +++ b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java @@ -81,9 +81,26 @@ public class StorageConstants { public static final 
String DEFAULT_BINARY_SERDE = "org.apache.tajo.storage.BinarySerializerDeserializer"; public static final String DEFAULT_TEXT_SERDE = "org.apache.tajo.storage.TextSerializerDeserializer"; + // ORC file properties ------------------------------------------------- public static final String ORC_MAX_MERGE_DISTANCE = "orc.max.merge.distance"; public static final String DEFAULT_ORC_MAX_MERGE_DISTANCE = "1048576"; // 1MB + public static final String ORC_STRIPE_SIZE = "orc.stripe.size"; + public static final String DEFAULT_ORC_STRIPE_SIZE = "1000"; + + public static final String ORC_COMPRESSION_KIND = "orc.compression.kind"; + public static final String ORC_COMPRESSION_KIND_NONE = "none"; + public static final String ORC_COMPRESSION_KIND_SNAPPY = "snappy"; + public static final String ORC_COMPRESSION_KIND_LZO = "lzo"; + public static final String ORC_COMPRESSION_KIND_ZIP = "zip"; + public static final String DEFAULT_ORC_COMPRESSION_KIND = ORC_COMPRESSION_KIND_NONE; + + public static final String ORC_BUFFER_SIZE = "orc.buffer.size"; + public static final String DEFAULT_ORC_BUFFER_SIZE = "000"; + + public static final String ORC_ROW_INDEX_STRIDE = "orc.rowindex.stride"; + public static final String DEFAULT_ORC_ROW_INDEX_STRIDE = "1000"; + // Parquet file properties ------------------------------------------------- public static final String PARQUET_DEFAULT_BLOCK_SIZE; public static final String PARQUET_DEFAULT_PAGE_SIZE; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java new file mode 100644 index 0000000000..de56e9b937 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java @@ -0,0 +1,108 @@ +package org.apache.tajo.storage.orc; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.tajo.TaskAttemptId; +import 
org.apache.tajo.catalog.Schema; +import org.apache.tajo.catalog.TableMeta; +import org.apache.tajo.catalog.statistics.TableStats; +import org.apache.tajo.storage.FileAppender; +import org.apache.tajo.storage.StorageConstants; +import org.apache.tajo.storage.TableStatistics; +import org.apache.tajo.storage.Tuple; +import org.apache.tajo.storage.orc.objectinspector.ObjectInspectorFactory; +import org.apache.tajo.storage.thirdparty.orc.CompressionKind; +import org.apache.tajo.storage.thirdparty.orc.OrcFile; +import org.apache.tajo.storage.thirdparty.orc.Writer; + +import java.io.IOException; + +public class OrcAppender extends FileAppender { + private Writer writer; + private TableStatistics stats; + + + public OrcAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, + TableMeta meta, Path workDir) { + super(conf, taskAttemptId, schema, meta, workDir); + } + + @Override + public void init() throws IOException { + writer = OrcFile.createWriter(workDir.getFileSystem(conf), + path, conf, + ObjectInspectorFactory.buildStructObjectInspector(schema), + Integer.parseInt(meta.getOption(StorageConstants.ORC_STRIPE_SIZE, + StorageConstants.DEFAULT_ORC_STRIPE_SIZE)), // default 1000 + getCompressionKind(), // default 'none' + Integer.parseInt(meta.getOption(StorageConstants.ORC_BUFFER_SIZE, + StorageConstants.DEFAULT_ORC_BUFFER_SIZE)), // default 100 + Integer.parseInt(meta.getOption(StorageConstants.ORC_ROW_INDEX_STRIDE, + StorageConstants.DEFAULT_ORC_ROW_INDEX_STRIDE))); // default 1000 + + if (enabledStats) { + this.stats = new TableStatistics(schema); + } + + super.init(); + } + + @Override + public long getOffset() throws IOException { + return 0; + } + + @Override + public void addTuple(Tuple tuple) throws IOException { + if (enabledStats) { + for (int i = 0; i < schema.size(); ++i) { + stats.analyzeField(i, tuple.get(i)); + } + } + writer.addTuple(tuple); + if (enabledStats) { + stats.incrementRow(); + } + } + + @Override + public void flush() throws 
IOException { + } + + @Override + public void close() throws IOException { + writer.close(); + } + + @Override + public TableStats getStats() { + if (enabledStats) { + return stats.getTableStat(); + } else { + return null; + } + } + + @Override + public long getEstimatedOutputSize() throws IOException { + return writer.getRawDataSize() * writer.getNumberOfRows(); + } + + private CompressionKind getCompressionKind() { + String kindstr = meta.getOption(StorageConstants.ORC_COMPRESSION_KIND, StorageConstants.DEFAULT_ORC_COMPRESSION_KIND); + + if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_ZIP)) { + return CompressionKind.ZLIB; + } + + if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_SNAPPY)) { + return CompressionKind.SNAPPY; + } + + if (kindstr.equalsIgnoreCase(StorageConstants.ORC_COMPRESSION_KIND_LZO)) { + return CompressionKind.LZO; + } + + return CompressionKind.NONE; + } +} From 0f8f83f625331c88d42e7695110da750fe0ea81c Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 8 Jul 2015 11:38:53 +0900 Subject: [PATCH 104/141] License added --- .../apache/tajo/storage/orc/OrcAppender.java | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java index de56e9b937..301889d3a0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.tajo.storage.orc; import org.apache.hadoop.conf.Configuration; From f4d74b06bc7105553956067c8d1903b6a494ae95 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 8 Jul 2015 14:42:31 +0900 Subject: [PATCH 105/141] Fix timestamp test --- .../src/test/java/org/apache/tajo/storage/orc/TestOrc.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java index 362b7aa106..c16fcb0bd6 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java @@ -42,6 +42,7 @@ import org.apache.tajo.storage.thirdparty.orc.CompressionKind; import org.apache.tajo.storage.thirdparty.orc.Writer; import org.apache.tajo.util.KeyValueSet; +import org.apache.tajo.util.datetime.DateTimeUtil; import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -143,7 +144,7 @@ public void testWrite() { tuple.put(0, new Int4Datum(100)); tuple.put(1, new Int2Datum((short)7)); tuple.put(2, new TextDatum("good")); - tuple.put(3, new TimestampDatum(System.currentTimeMillis() * 1000)); + tuple.put(3, new TimestampDatum(DateTimeUtil.javaTimeToJulianTime(System.currentTimeMillis()))); 
orcWriter.addTuple(tuple); From dd924782d0b72830baa471c79e24dc118220bc67 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 8 Jul 2015 14:52:53 +0900 Subject: [PATCH 106/141] Registered orc appender --- .../src/main/resources/storage-default.xml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index eb72110ee2..0d1f48add9 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -147,7 +147,7 @@ tajo.storage.appender-handler - text,csv,raw,rcfile,row,parquet,sequencefile,avro,hbase + text,csv,raw,rcfile,row,parquet,orc,sequencefile,avro,hbase @@ -185,6 +185,11 @@ org.apache.tajo.storage.parquet.ParquetAppender + + tajo.storage.appender-handler.orc.class + org.apache.tajo.storage.orc.OrcAppender + + tajo.storage.appender-handler.sequencefile.class org.apache.tajo.storage.sequencefile.SequenceFileAppender From 3ecbbbfdaa3ed7e7cdcfcf8508ecc19e9d80e961 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 8 Jul 2015 18:15:49 +0900 Subject: [PATCH 107/141] Default option bug fixed --- .../org/apache/tajo/storage/StorageConstants.java | 2 +- .../org/apache/tajo/storage/orc/OrcAppender.java | 2 +- .../java/org/apache/tajo/storage/orc/TestOrc.java | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java index 5fc51b1afa..45a1147525 100644 --- a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java +++ b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java @@ -96,7 +96,7 @@ public class StorageConstants { public static final String DEFAULT_ORC_COMPRESSION_KIND = ORC_COMPRESSION_KIND_NONE; 
public static final String ORC_BUFFER_SIZE = "orc.buffer.size"; - public static final String DEFAULT_ORC_BUFFER_SIZE = "000"; + public static final String DEFAULT_ORC_BUFFER_SIZE = "1024"; public static final String ORC_ROW_INDEX_STRIDE = "orc.rowindex.stride"; public static final String DEFAULT_ORC_ROW_INDEX_STRIDE = "1000"; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java index 301889d3a0..d721a732da 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java @@ -54,7 +54,7 @@ public void init() throws IOException { StorageConstants.DEFAULT_ORC_STRIPE_SIZE)), // default 1000 getCompressionKind(), // default 'none' Integer.parseInt(meta.getOption(StorageConstants.ORC_BUFFER_SIZE, - StorageConstants.DEFAULT_ORC_BUFFER_SIZE)), // default 100 + StorageConstants.DEFAULT_ORC_BUFFER_SIZE)), // default 1024 bytes Integer.parseInt(meta.getOption(StorageConstants.ORC_ROW_INDEX_STRIDE, StorageConstants.DEFAULT_ORC_ROW_INDEX_STRIDE))); // default 1000 diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java index c16fcb0bd6..e8833bad36 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java @@ -38,9 +38,6 @@ import org.apache.tajo.storage.fragment.FileFragment; import org.apache.tajo.storage.fragment.Fragment; import org.apache.tajo.storage.orc.objectinspector.ObjectInspectorFactory; -import org.apache.tajo.storage.thirdparty.orc.OrcFile; -import org.apache.tajo.storage.thirdparty.orc.CompressionKind; -import 
org.apache.tajo.storage.thirdparty.orc.Writer; import org.apache.tajo.util.KeyValueSet; import org.apache.tajo.util.datetime.DateTimeUtil; import org.junit.After; @@ -138,7 +135,11 @@ public void testWrite() { fs.delete(writePath); } - Writer orcWriter = OrcFile.createWriter(fs, writePath, conf, structOI, 1000, CompressionKind.NONE, 100, 1000); + TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORC, new KeyValueSet()); + + OrcAppender appender = new OrcAppender(conf, null, schema, meta, writePath); + + appender.init(); Tuple tuple = new VTuple(schema.size()); tuple.put(0, new Int4Datum(100)); @@ -146,11 +147,10 @@ public void testWrite() { tuple.put(2, new TextDatum("good")); tuple.put(3, new TimestampDatum(DateTimeUtil.javaTimeToJulianTime(System.currentTimeMillis()))); - orcWriter.addTuple(tuple); + appender.addTuple(tuple); - orcWriter.close(); + appender.close(); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORC, new KeyValueSet()); Fragment fragment = getFileFragment("temp_test.orc"); OrcScanner orcScanner = new OrcScanner(conf, schema, meta, fragment); orcScanner.init(); From 413515643310c7b41009ccad45976693d4091942 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 11:51:38 +0900 Subject: [PATCH 108/141] DateDatum is modified based on julian date --- .../java/org/apache/tajo/datum/DateDatum.java | 84 ++++++++----------- 1 file changed, 34 insertions(+), 50 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java index 093a8be003..7cf0896354 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java @@ -25,6 +25,7 @@ import org.apache.tajo.exception.InvalidOperationException; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.datetime.DateTimeConstants.DateStyle; +import org.apache.tajo.util.datetime.DateTimeFormat; import 
org.apache.tajo.util.datetime.DateTimeUtil; import org.apache.tajo.util.datetime.TimeMeta; @@ -32,69 +33,60 @@ public class DateDatum extends Datum { public static final int SIZE = 4; // Dates are stored in UTC. - final int year; - final int monthOfYear; - final int dayOfMonth; + private int jdate; public DateDatum(int value) { - this(DateTimeUtil.j2date(value)); + super(TajoDataTypes.Type.DATE); + + jdate = value; } public DateDatum(TimeMeta tm) { super(TajoDataTypes.Type.DATE); - year = tm.years; - monthOfYear = tm.monthOfYear; - dayOfMonth = tm.dayOfMonth; + jdate = DateTimeUtil.date2j(tm.years, tm.monthOfYear, tm.dayOfMonth); } public TimeMeta asTimeMeta() { TimeMeta tm = new TimeMeta(); - tm.years = year; - tm.monthOfYear = monthOfYear; - tm.dayOfMonth = dayOfMonth; + DateTimeUtil.j2date(jdate, tm); + return tm; } public int getCenturyOfEra() { - TimeMeta tm = asTimeMeta(); - return tm.getCenturyOfEra(); + return asTimeMeta().getCenturyOfEra(); } public int getYear() { - return year; + return asTimeMeta().years; } public int getWeekyear() { - TimeMeta tm = asTimeMeta(); - return tm.getWeekyear(); + return asTimeMeta().getWeekyear(); } public int getMonthOfYear() { - return monthOfYear; + return asTimeMeta().monthOfYear; } public int getDayOfYear() { - TimeMeta tm = asTimeMeta(); - return tm.getDayOfYear(); + return asTimeMeta().getDayOfYear(); } public int getDayOfWeek() { - TimeMeta tm = asTimeMeta(); - return tm.getDayOfWeek(); + return asTimeMeta().getDayOfWeek(); } public int getISODayOfWeek() { - TimeMeta tm = asTimeMeta(); - return tm.getISODayOfWeek(); + return asTimeMeta().getISODayOfWeek(); } public int getWeekOfYear() { - TimeMeta tm = asTimeMeta(); - return tm.getWeekOfYear(); + return asTimeMeta().getWeekOfYear(); } public int getDayOfMonth() { - return dayOfMonth; + return asTimeMeta().dayOfMonth; } @Override @@ -156,9 +148,7 @@ public Datum minus(Datum datum) { } case DATE: { DateDatum d = (DateDatum) datum; - int day1 = DateTimeUtil.date2j(year, 
monthOfYear, dayOfMonth); - int day2 = DateTimeUtil.date2j(d.year, d.monthOfYear, d.dayOfMonth); - return new Int4Datum(day1 - day2); + return new Int4Datum(jdate - d.jdate); } default: throw new InvalidOperationException(datum.type()); @@ -167,16 +157,12 @@ public Datum minus(Datum datum) { @Override public int asInt4() { - return encode(); - } - - private int encode() { - return DateTimeUtil.date2j(year, monthOfYear, dayOfMonth); + return jdate; } @Override public long asInt8() { - return encode(); + return jdate; } @Override @@ -191,7 +177,11 @@ public double asFloat8() { @Override public String asChars() { - return DateTimeUtil.encodeDate(year, monthOfYear, dayOfMonth, DateStyle.ISO_DATES); + return DateTimeUtil.encodeDate(asTimeMeta(), DateStyle.ISO_DATES); + } + + public String toChars(String format) { + return DateTimeFormat.to_char(asTimeMeta(), format); } @Override @@ -201,7 +191,7 @@ public int size() { @Override public byte [] asByteArray() { - return Bytes.toBytes(encode()); + return Bytes.toBytes(jdate); } @Override @@ -219,15 +209,7 @@ public Datum equalsTo(Datum datum) { public int compareTo(Datum datum) { if (datum.type() == TajoDataTypes.Type.DATE) { DateDatum another = (DateDatum) datum; - int compare = Ints.compare(year, another.year); - if (compare != 0) { - return compare; - } - compare = Ints.compare(monthOfYear, another.monthOfYear); - if (compare != 0) { - return compare; - } - return Ints.compare(dayOfMonth, another.dayOfMonth); + return Ints.compare(jdate, another.jdate); } else if (datum.type() == TajoDataTypes.Type.TIMESTAMP) { TimestampDatum another = (TimestampDatum) datum; TimeMeta myMeta, otherMeta; @@ -243,9 +225,10 @@ public int compareTo(Datum datum) { @Override public boolean equals(Object obj) { + TimeMeta tm = asTimeMeta(); if (obj instanceof DateDatum) { - DateDatum another = (DateDatum) obj; - return year == another.year && monthOfYear == another.monthOfYear && dayOfMonth == another.dayOfMonth; + TimeMeta another = 
((DateDatum) obj).asTimeMeta(); + return tm.years == another.years && tm.monthOfYear == another.monthOfYear && tm.dayOfMonth == another.dayOfMonth; } else { return false; } @@ -253,10 +236,11 @@ public boolean equals(Object obj) { @Override public int hashCode() { + TimeMeta tm = asTimeMeta(); int total = 157; - total = 23 * total + year; - total = 23 * total + monthOfYear; - total = 23 * total + dayOfMonth; + total = 23 * total + tm.years; + total = 23 * total + tm.monthOfYear; + total = 23 * total + tm.dayOfMonth; return total; } From b6346dd00fb64606ba1925fb7dd5d7235049803e Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 15:10:04 +0900 Subject: [PATCH 109/141] DateObjectInspector added --- .../TajoDateObjectInspector.java | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java new file mode 100644 index 0000000000..f12706b8df --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDateObjectInspector.java @@ -0,0 +1,73 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +import java.sql.Date; + +public class TajoDateObjectInspector extends TajoPrimitiveObjectInspector implements DateObjectInspector { + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.dateTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.DATE; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public DateWritable getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return null; + } + + @Override + public Date getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "DATE"; + } +} From 4f3a144393b565cdc5d20f87e2d8c7bbbb92c63f Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 15:10:41 +0900 Subject: [PATCH 110/141] default values modified --- .../java/org/apache/tajo/storage/StorageConstants.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java index 45a1147525..2680ac1758 100644 --- a/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java +++ b/tajo-common/src/main/java/org/apache/tajo/storage/StorageConstants.java @@ -86,20 +86,20 @@ public class StorageConstants { public static final String DEFAULT_ORC_MAX_MERGE_DISTANCE = "1048576"; // 1MB public static final String ORC_STRIPE_SIZE = "orc.stripe.size"; - public static final String DEFAULT_ORC_STRIPE_SIZE = "1000"; + public static final String DEFAULT_ORC_STRIPE_SIZE = "67108864"; // 64MB public static final String ORC_COMPRESSION_KIND = "orc.compression.kind"; public static final String ORC_COMPRESSION_KIND_NONE = "none"; public static final String ORC_COMPRESSION_KIND_SNAPPY = "snappy"; public static final String ORC_COMPRESSION_KIND_LZO = "lzo"; - public static final String ORC_COMPRESSION_KIND_ZIP = "zip"; + public static final String ORC_COMPRESSION_KIND_ZIP = "zlip"; public static final String DEFAULT_ORC_COMPRESSION_KIND = ORC_COMPRESSION_KIND_NONE; public static final String ORC_BUFFER_SIZE = "orc.buffer.size"; - public static final String DEFAULT_ORC_BUFFER_SIZE = "1024"; + public static final String DEFAULT_ORC_BUFFER_SIZE = "262144"; // 256KB public static final String ORC_ROW_INDEX_STRIDE = "orc.rowindex.stride"; - public static final String DEFAULT_ORC_ROW_INDEX_STRIDE = "1000"; + public static final String DEFAULT_ORC_ROW_INDEX_STRIDE = "10000"; // Parquet file properties ------------------------------------------------- public static final String PARQUET_DEFAULT_BLOCK_SIZE; From d9035336ecee4abe171e290f3abc4366dbca5b7c Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 15:11:24 +0900 Subject: [PATCH 111/141] default comment removed --- .../org/apache/tajo/storage/orc/OrcAppender.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java index d721a732da..e06c6d3472 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java @@ -47,16 +47,14 @@ public OrcAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schem @Override public void init() throws IOException { - writer = OrcFile.createWriter(workDir.getFileSystem(conf), - path, conf, + writer = OrcFile.createWriter(workDir.getFileSystem(conf), path, conf, ObjectInspectorFactory.buildStructObjectInspector(schema), - Integer.parseInt(meta.getOption(StorageConstants.ORC_STRIPE_SIZE, - StorageConstants.DEFAULT_ORC_STRIPE_SIZE)), // default 1000 - getCompressionKind(), // default 'none' + Long.parseLong(meta.getOption(StorageConstants.ORC_STRIPE_SIZE, + StorageConstants.DEFAULT_ORC_STRIPE_SIZE)), getCompressionKind(), Integer.parseInt(meta.getOption(StorageConstants.ORC_BUFFER_SIZE, - StorageConstants.DEFAULT_ORC_BUFFER_SIZE)), // default 1024 bytes + StorageConstants.DEFAULT_ORC_BUFFER_SIZE)), Integer.parseInt(meta.getOption(StorageConstants.ORC_ROW_INDEX_STRIDE, - StorageConstants.DEFAULT_ORC_ROW_INDEX_STRIDE))); // default 1000 + StorageConstants.DEFAULT_ORC_ROW_INDEX_STRIDE))); if (enabledStats) { this.stats = new TableStatistics(schema); From 96a0be421eace8a9135007f00cb3c1e599d9b66c Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 15:12:10 +0900 Subject: [PATCH 112/141] Date related things modified --- .../ObjectInspectorFactory.java | 4 + .../TajoDoubleObjectInspector.java | 4 +- .../TajoFloatObjectInspector.java | 4 +- .../TajoIntObjectInspector.java | 2 +- .../TajoLongObjectInspector.java | 6 +- .../TajoShortObjectInspector.java | 4 +- .../TajoStringObjectInspector.java | 2 +- 
.../TajoStructObjectInspector.java | 2 +- .../thirdparty/orc/ColumnStatisticsImpl.java | 17 ++-- .../storage/thirdparty/orc/WriterImpl.java | 77 +++++++++++++++---- 10 files changed, 85 insertions(+), 37 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java index 7dff7fc81c..697be8b63d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java @@ -63,6 +63,10 @@ public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type data oi = new TajoTimestampObjectInspector(); break; + case DATE: + oi = new TajoDateObjectInspector(); + break; + default: throw new UnsupportedException(dataType.name()+" is not supported yet in OrcAppender"); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java index c28553500d..6dc1f8c95c 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoDoubleObjectInspector.java @@ -51,7 +51,7 @@ public Object getPrimitiveWritableObject(Object o) { @Override public Class getJavaPrimitiveClass() { - return null; + return Double.class; } @Override @@ -71,6 +71,6 @@ public boolean preferWritable() { @Override public String getTypeName() { - return "FLOAT8"; + return "DOUBLE"; } } diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java index d1a7ad3f5b..8f4ffdeead 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoFloatObjectInspector.java @@ -52,7 +52,7 @@ public Object getPrimitiveWritableObject(Object o) { @Override public Class getJavaPrimitiveClass() { - return null; + return Float.class; } @Override @@ -72,6 +72,6 @@ public boolean preferWritable() { @Override public String getTypeName() { - return "FLOAT4"; + return "FLOAT"; } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java index 9718fef218..a0c2209678 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoIntObjectInspector.java @@ -71,6 +71,6 @@ public boolean preferWritable() { @Override public String getTypeName() { - return "INT4"; + return "INT"; } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java index bc6610a31a..b30b3338f6 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoLongObjectInspector.java @@ -36,7 +36,7 @@ public PrimitiveTypeInfo getTypeInfo() { @Override public PrimitiveCategory getPrimitiveCategory() { - return PrimitiveCategory.SHORT; + return PrimitiveCategory.LONG; } @Override @@ -51,7 +51,7 @@ public Object getPrimitiveWritableObject(Object o) { @Override public Class getJavaPrimitiveClass() { - return Integer.class; + return Long.class; } @Override @@ -71,6 +71,6 @@ public boolean preferWritable() { @Override public String getTypeName() { - return "INT8"; + return "LONG"; } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java index 08ed694662..d32bee172a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoShortObjectInspector.java @@ -51,7 +51,7 @@ public Object getPrimitiveWritableObject(Object o) { @Override public Class getJavaPrimitiveClass() { - return Integer.class; + return Short.class; } @Override @@ -71,6 +71,6 @@ public boolean preferWritable() { @Override public String getTypeName() { - return "INT2"; + return "SHORT"; } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java index a980f7cf89..b9331da6cd 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStringObjectInspector.java @@ -66,6 +66,6 @@ public boolean preferWritable() { @Override public String getTypeName() { - return "TEXT"; + return "STRING"; } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java index c2ac1954ec..177c5553a0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java @@ -112,7 +112,7 @@ public List getStructFieldsDataAsList(Object o) { @Override public String getTypeName() { - return "struct"; + return "STRUCT"; } @Override diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java index 78cce1efa6..853ad6f8db 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java @@ -22,6 +22,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.tajo.datum.BlobDatum; +import org.apache.tajo.datum.DateDatum; import java.sql.Date; import java.sql.Timestamp; @@ -689,14 +690,14 @@ void reset() { } @Override - void updateDate(DateWritable value) { + void updateDate(int daysSinceEpoch) { if (minimum == null) { - minimum = value.getDays(); - maximum = value.getDays(); - } else if (minimum > 
value.getDays()) { - minimum = value.getDays(); - } else if (maximum < value.getDays()) { - maximum = value.getDays(); + minimum = daysSinceEpoch; + maximum = daysSinceEpoch; + } else if (minimum > daysSinceEpoch) { + minimum = daysSinceEpoch; + } else if (maximum < daysSinceEpoch) { + maximum = daysSinceEpoch; } } @@ -914,7 +915,7 @@ void updateDecimal(HiveDecimal value) { throw new UnsupportedOperationException("Can't update decimal"); } - void updateDate(DateWritable value) { + void updateDate(int days) { throw new UnsupportedOperationException("Can't update date"); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 7683995165..63c547902b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -31,6 +31,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.io.IOConstants; +import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.shims.ShimLoader; import org.apache.tajo.datum.*; import org.apache.tajo.storage.Tuple; @@ -966,9 +967,6 @@ void recordPosition(PositionRecorder recorder) throws IOException { private static class IntegerTreeWriter extends TreeWriter { private final IntegerWriter writer; - private final ShortObjectInspector shortInspector; - private final IntObjectInspector intInspector; - private final LongObjectInspector longInspector; private boolean isDirectV2 = true; IntegerTreeWriter(int columnId, @@ -980,20 +978,6 @@ private static class IntegerTreeWriter extends TreeWriter { OrcProto.Stream.Kind.DATA); this.isDirectV2 = isNewWriteFormat(writer); this.writer = createIntegerWriter(out, true, isDirectV2, writer); - if (inspector 
instanceof IntObjectInspector) { - intInspector = (IntObjectInspector) inspector; - shortInspector = null; - longInspector = null; - } else { - intInspector = null; - if (inspector instanceof LongObjectInspector) { - longInspector = (LongObjectInspector) inspector; - shortInspector = null; - } else { - shortInspector = (ShortObjectInspector) inspector; - longInspector = null; - } - } recordPosition(rowIndexPosition); } @@ -1542,6 +1526,62 @@ void recordPosition(PositionRecorder recorder) throws IOException { } } + private static class DateTreeWriter extends TreeWriter { + private final IntegerWriter writer; + private final boolean isDirectV2; + + DateTreeWriter(int columnId, + ObjectInspector inspector, + StreamFactory writer, + boolean nullable) throws IOException { + super(columnId, inspector, writer, nullable); + OutStream out = writer.createStream(id, + OrcProto.Stream.Kind.DATA); + this.isDirectV2 = isNewWriteFormat(writer); + this.writer = createIntegerWriter(out, true, isDirectV2, writer); + recordPosition(rowIndexPosition); + } + + @Override + void write(Datum datum) throws IOException { + final int DAYS_FROM_JULIAN_TO_EPOCH = 2440588; + super.write(datum); + if (datum != null && datum.isNotNull()) { + int daysSinceEpoch = datum.asInt4() - DAYS_FROM_JULIAN_TO_EPOCH; + // Using the Writable here as it's used directly for writing as well as for stats. 
+ indexStatistics.updateDate(daysSinceEpoch); + writer.write(daysSinceEpoch); + if (createBloomFilter) { + bloomFilter.addLong(daysSinceEpoch); + } + } + } + + @Override + void writeStripe(OrcProto.StripeFooter.Builder builder, + int requiredIndexEntries) throws IOException { + super.writeStripe(builder, requiredIndexEntries); + writer.flush(); + recordPosition(rowIndexPosition); + } + + @Override + void recordPosition(PositionRecorder recorder) throws IOException { + super.recordPosition(recorder); + writer.getPosition(recorder); + } + + @Override + OrcProto.ColumnEncoding getEncoding() { + if (isDirectV2) { + return OrcProto.ColumnEncoding.newBuilder() + .setKind(OrcProto.ColumnEncoding.Kind.DIRECT_V2).build(); + } + return OrcProto.ColumnEncoding.newBuilder() + .setKind(OrcProto.ColumnEncoding.Kind.DIRECT).build(); + } + } + private static class StructTreeWriter extends TreeWriter { private final List fields; StructTreeWriter(int columnId, @@ -1623,6 +1663,9 @@ private static TreeWriter createTreeWriter(ObjectInspector inspector, case TIMESTAMP: return new TimestampTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); + case DATE: + return new DateTreeWriter(streamFactory.getNextColumnId(), + inspector, streamFactory, nullable); default: throw new IllegalArgumentException("Bad primitive category " + ((PrimitiveObjectInspector) inspector).getPrimitiveCategory()); From 910b15e3334f8a17071cabd2e92c85ca907f4eb5 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 16:39:43 +0900 Subject: [PATCH 113/141] Missed flushing stripe added --- .../java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java | 1 + 1 file changed, 1 insertion(+) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 63c547902b..ecc2e14cc7 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -2091,6 +2091,7 @@ public void addTuple(Tuple tuple) throws IOException { } } } + memoryManager.addedRow(); } @Override From ec4991dea768f3d6b40a082e5b0a54ea7bb186cf Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 9 Jul 2015 17:41:00 +0900 Subject: [PATCH 114/141] date type added --- .../src/main/java/org/apache/tajo/storage/orc/ORCScanner.java | 3 +++ .../org/apache/tajo/storage/thirdparty/orc/WriterImpl.java | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java index ab74cdfc4a..fab1a97611 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java @@ -270,6 +270,9 @@ private Datum createValueDatum(Vector vector, TajoDataTypes.DataType type) { case NULL_TYPE: return NullDatum.get(); + case DATE: + return new DateDatum((int)((LongVector)vector).vector[currentPosInBatch] + DateTimeUtil.DAYS_FROM_JULIAN_TO_EPOCH); + default: throw new UnsupportedException("This data type is not supported currently: "+type.toString()); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index ecc2e14cc7..2fdbc59d0e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -1544,10 +1544,9 @@ private static class 
DateTreeWriter extends TreeWriter { @Override void write(Datum datum) throws IOException { - final int DAYS_FROM_JULIAN_TO_EPOCH = 2440588; super.write(datum); if (datum != null && datum.isNotNull()) { - int daysSinceEpoch = datum.asInt4() - DAYS_FROM_JULIAN_TO_EPOCH; + int daysSinceEpoch = datum.asInt4() - DateTimeUtil.DAYS_FROM_JULIAN_TO_EPOCH; // Using the Writable here as it's used directly for writing as well as for stats. indexStatistics.updateDate(daysSinceEpoch); writer.write(daysSinceEpoch); From f19bb5f65d2fd4bc7271c128ed87ec421e6a5b47 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 15 Jul 2015 18:38:09 +0900 Subject: [PATCH 115/141] TestStorages passed --- .../tajo/datum/ProtobufDatumFactory.java | 22 +++++- .../TextFieldSerializerDeserializer.java | 2 +- .../storage/BinarySerializerDeserializer.java | 6 +- .../src/main/resources/storage-default.xml | 2 +- .../java/org/apache/tajo/storage/RawFile.java | 5 +- .../apache/tajo/storage/orc/ORCScanner.java | 12 +++- .../ObjectInspectorFactory.java | 17 ++++- .../TajoBlobObjectInspector.java | 64 +++++++++++++++++ .../TajoBooleanObjectInspector.java | 58 ++++++++++++++++ .../TajoNullObjectInspector.java | 69 +++++++++++++++++++ .../text/TextFieldSerializerDeserializer.java | 2 +- .../thirdparty/orc/ColumnStatisticsImpl.java | 5 +- .../storage/thirdparty/orc/WriterImpl.java | 13 ++-- .../org/apache/tajo/storage/TestStorages.java | 1 + .../src/test/resources/storage-default.xml | 18 ++++- 15 files changed, 269 insertions(+), 27 deletions(-) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java diff --git 
a/tajo-common/src/main/java/org/apache/tajo/datum/ProtobufDatumFactory.java b/tajo-common/src/main/java/org/apache/tajo/datum/ProtobufDatumFactory.java index 0d585a4dda..a30e52c2fa 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/ProtobufDatumFactory.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/ProtobufDatumFactory.java @@ -21,6 +21,7 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Maps; import com.google.protobuf.GeneratedMessage; +import com.google.protobuf.InvalidProtocolBufferException; import com.google.protobuf.Message; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.datum.protobuf.ProtobufJsonFormat; @@ -60,14 +61,30 @@ public T newBuilder() { return (T) builder; } - public ProtobufDatum createDatum(Message.Builder builder) { + public static ProtobufDatum createDatum(Message.Builder builder) { return createDatum(builder.build()); } - public ProtobufDatum createDatum(Message message) { + public static ProtobufDatum createDatum(Message message) { return new ProtobufDatum(message); } + public static ProtobufDatum createDatum(String className, byte [] bytes, int offset, int length) + throws InvalidProtocolBufferException { + ProtobufDatumFactory factory = get(className); + Message.Builder builder = factory.newBuilder(); + builder.mergeFrom(bytes, offset, length); + return createDatum(builder); + } + + public static Datum createDatum(DataType type, byte[] bytes) + throws InvalidProtocolBufferException { + ProtobufDatumFactory factory = get(type); + Message.Builder builder = factory.newBuilder(); + builder.mergeFrom(bytes); + return createDatum(builder); + } + public static ProtobufDatumFactory get(DataType dataType) { Preconditions.checkArgument(dataType.getType() == TajoDataTypes.Type.PROTOBUF, "ProtobufDatumFactory only can accepts Protocol Buffer Datum Type."); @@ -88,4 +105,5 @@ public static ProtobufDatumFactory get(String className) { public static String toJson(Message message) 
{ return protobufFormatter.printToString(message); } + } diff --git a/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/TextFieldSerializerDeserializer.java b/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/TextFieldSerializerDeserializer.java index c22a0f2ac9..3ad21c3a3d 100644 --- a/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/TextFieldSerializerDeserializer.java +++ b/tajo-plan/src/main/java/org/apache/tajo/plan/function/stream/TextFieldSerializerDeserializer.java @@ -236,7 +236,7 @@ public Datum deserialize(ByteBuf buf, TajoDataTypes.DataType dataType, ByteBuf n byte[] bytes = new byte[buf.readableBytes()]; buf.readBytes(bytes); protobufJsonFormat.merge(bytes, builder); - datum = factory.createDatum(builder.build()); + datum = ProtobufDatumFactory.createDatum(builder.build()); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); diff --git a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BinarySerializerDeserializer.java b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BinarySerializerDeserializer.java index 2cccb698c9..ae1e68d84f 100644 --- a/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BinarySerializerDeserializer.java +++ b/tajo-storage/tajo-storage-common/src/main/java/org/apache/tajo/storage/BinarySerializerDeserializer.java @@ -160,10 +160,8 @@ public Datum deserialize(int index, byte[] bytes, int offset, int length, byte[] break; } case PROTOBUF: { - ProtobufDatumFactory factory = ProtobufDatumFactory.get(column.getDataType().getCode()); - Message.Builder builder = factory.newBuilder(); - builder.mergeFrom(bytes, offset, length); - datum = factory.createDatum(builder); + datum = ProtobufDatumFactory.createDatum(column.getDataType().getCode(), + bytes, offset, length); break; } case INET4: diff --git a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml 
b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml index 0d1f48add9..aa2c4c7eef 100644 --- a/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/main/resources/storage-default.xml @@ -187,7 +187,7 @@ tajo.storage.appender-handler.orc.class - org.apache.tajo.storage.orc.OrcAppender + org.apache.tajo.storage.orc.ORCAppender diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java index 3b655be36f..d6f52cc33a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/RawFile.java @@ -335,10 +335,7 @@ public Tuple next() throws IOException { byte [] rawBytes = new byte[len]; buffer.get(rawBytes); - ProtobufDatumFactory factory = ProtobufDatumFactory.get(columnTypes[i]); - Message.Builder builder = factory.newBuilder(); - builder.mergeFrom(rawBytes); - tuple.put(i, factory.createDatum(builder.build())); + tuple.put(i, ProtobufDatumFactory.createDatum(columnTypes[i], rawBytes)); break; } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java index fab1a97611..cca83de57e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java @@ -247,6 +247,15 @@ private Datum createValueDatum(Vector vector, TajoDataTypes.DataType type) { return DatumFactory.createBlob(((SliceVector) vector).vector[currentPosInBatch].getBytes()); + case PROTOBUF: + try { + return ProtobufDatumFactory.createDatum(type, + ((SliceVector) vector).vector[currentPosInBatch].getBytes()); + } catch 
(InvalidProtocolBufferException e) { + LOG.error("ERROR", e); + return NullDatum.get(); + } + case TIMESTAMP: if (((LongVector) vector).isNull[currentPosInBatch]) return NullDatum.get(); @@ -270,9 +279,6 @@ private Datum createValueDatum(Vector vector, TajoDataTypes.DataType type) { case NULL_TYPE: return NullDatum.get(); - case DATE: - return new DateDatum((int)((LongVector)vector).vector[currentPosInBatch] + DateTimeUtil.DAYS_FROM_JULIAN_TO_EPOCH); - default: throw new UnsupportedException("This data type is not supported currently: "+type.toString()); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java index 697be8b63d..061ba0d034 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/ObjectInspectorFactory.java @@ -32,13 +32,18 @@ public static StructObjectInspector buildStructObjectInspector(Schema schema) { } public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type dataType) throws UnsupportedException { - ObjectInspector oi; + ObjectInspector oi = null; switch(dataType) { + case BOOLEAN: + oi = new TajoBooleanObjectInspector(); + break; + case INT2: oi = new TajoShortObjectInspector(); break; + case INET4: case INT4: oi = new TajoIntObjectInspector(); break; @@ -56,6 +61,7 @@ public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type data break; case TEXT: + case CHAR: oi = new TajoStringObjectInspector(); break; @@ -67,6 +73,15 @@ public static ObjectInspector buildObjectInspectorByType(TajoDataTypes.Type data oi = new TajoDateObjectInspector(); break; + case BLOB: + case PROTOBUF: + oi = new TajoBlobObjectInspector(); + break; + + case NULL_TYPE: + oi = new 
TajoNullObjectInspector(); + break; + default: throw new UnsupportedException(dataType.name()+" is not supported yet in OrcAppender"); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java new file mode 100644 index 0000000000..0b2d0a70b0 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java @@ -0,0 +1,64 @@ +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.hadoop.io.BytesWritable; +import org.apache.tajo.datum.Datum; + +public class TajoBlobObjectInspector extends TajoPrimitiveObjectInspector implements BinaryObjectInspector { + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.binaryTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.BINARY; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public BytesWritable getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return byte [].class; + } + + @Override + public byte[] getPrimitiveJavaObject(Object o) { + return ((Datum)o).asByteArray(); + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public int precision() { + return 0; + } + + @Override + public int scale() { + return 0; + } + + @Override + public String getTypeName() { + return "BINARY"; + } +} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java new file mode 100644 index 0000000000..4fa65fd59b --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java @@ -0,0 +1,58 @@ +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; +import org.apache.tajo.datum.Datum; + +public class TajoBooleanObjectInspector extends TajoPrimitiveObjectInspector implements BooleanObjectInspector { + @Override + public boolean get(Object o) { + return ((Datum)o).asBool(); + } + + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.booleanTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.BOOLEAN; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public Object getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return Boolean.class; + } + + @Override + public Object getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "BOOLEAN"; + } +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java new file mode 100644 index 
0000000000..49998ce30e --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoNullObjectInspector.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tajo.storage.orc.objectinspector; + +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; + +public class TajoNullObjectInspector extends TajoPrimitiveObjectInspector { + @Override + public PrimitiveTypeInfo getTypeInfo() { + return TypeInfoFactory.voidTypeInfo; + } + + @Override + public PrimitiveCategory getPrimitiveCategory() { + return PrimitiveCategory.VOID; + } + + @Override + public Class getPrimitiveWritableClass() { + return null; + } + + @Override + public Object getPrimitiveWritableObject(Object o) { + return null; + } + + @Override + public Class getJavaPrimitiveClass() { + return Void.class; + } + + @Override + public Object getPrimitiveJavaObject(Object o) { + return null; + } + + @Override + public Object copyObject(Object o) { + return null; + } + + @Override + public boolean preferWritable() { + return false; + } + + @Override + public String getTypeName() { + return "NULL"; + 
} +} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java index d7a43e1885..48aaf29c6d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java @@ -244,7 +244,7 @@ public Datum deserialize(int columnIndex, ByteBuf buf, ByteBuf nullChars) throws byte[] bytes = new byte[buf.readableBytes()]; buf.readBytes(bytes); protobufJsonFormat.merge(bytes, builder); - datum = factory.createDatum(builder.build()); + datum = ProtobufDatumFactory.createDatum(builder.build()); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java index 853ad6f8db..8742db1c8e 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java @@ -23,6 +23,7 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.tajo.datum.BlobDatum; import org.apache.tajo.datum.DateDatum; +import org.apache.tajo.datum.Datum; import java.sql.Date; import java.sql.Timestamp; @@ -495,7 +496,7 @@ void reset() { } @Override - void updateBinary(BlobDatum value) { + void updateBinary(Datum value) { sum += value.size(); } @@ -907,7 +908,7 @@ void updateString(String value) { throw new UnsupportedOperationException("Can't update string"); } - void updateBinary(BlobDatum value) { + void 
updateBinary(Datum value) { throw new UnsupportedOperationException("Can't update binary"); } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 2fdbc59d0e..df34ab2449 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -996,7 +996,7 @@ void write(Datum datum) throws IOException { super.write(datum); if (datum != null && datum.isNotNull()) { long val; - if (datum instanceof Int4Datum) { + if (datum instanceof Int4Datum || datum instanceof Inet4Datum) { val = datum.asInt4(); } else if (datum instanceof Int8Datum) { val = datum.asInt8(); @@ -1418,12 +1418,11 @@ OrcProto.ColumnEncoding getEncoding() { void write(Datum datum) throws IOException { super.write(datum); if (datum != null && datum.isNotNull()) { - BlobDatum val = (BlobDatum)datum; - stream.write(val.asByteArray(), 0, val.size()); - length.write(val.size()); - indexStatistics.updateBinary(val); + stream.write(datum.asByteArray(), 0, datum.size()); + length.write(datum.size()); + indexStatistics.updateBinary(datum); if (createBloomFilter) { - bloomFilter.addBytes(val.asByteArray(), val.size()); + bloomFilter.addBytes(datum.asByteArray(), datum.size()); } } } @@ -1631,6 +1630,7 @@ private static TreeWriter createTreeWriter(ObjectInspector inspector, case PRIMITIVE: switch (((PrimitiveObjectInspector) inspector).getPrimitiveCategory()) { case BOOLEAN: + case VOID: return new BooleanTreeWriter(streamFactory.getNextColumnId(), inspector, streamFactory, nullable); case BYTE: @@ -1685,6 +1685,7 @@ private static void writeTypes(OrcProto.Footer.Builder builder, case PRIMITIVE: switch (((PrimitiveObjectInspector) treeWriter.inspector). 
getPrimitiveCategory()) { + case VOID: case BOOLEAN: type.setKind(OrcProto.Type.Kind.BOOLEAN); break; diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java index b53dbeccd0..8dd956dcf4 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java @@ -139,6 +139,7 @@ public static Collection generateParameters() { {"RAW", false, true, true}, {"RCFILE", true, true, false}, {"PARQUET", false, false, false}, + {"ORC", true, true, false}, {"SEQUENCEFILE", true, true, false}, {"AVRO", false, false, false}, {"TEXT", true, true, true}, diff --git a/tajo-storage/tajo-storage-hdfs/src/test/resources/storage-default.xml b/tajo-storage/tajo-storage-hdfs/src/test/resources/storage-default.xml index 6a9e7ce75e..23041cf5e0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-hdfs/src/test/resources/storage-default.xml @@ -38,7 +38,7 @@ tajo.storage.scanner-handler - text,csv,json,raw,rcfile,row,parquet,sequencefile,avro + text,csv,json,raw,rcfile,row,parquet,orc,sequencefile,avro @@ -70,6 +70,10 @@ tajo.storage.fragment.parquet.class org.apache.tajo.storage.fragment.FileFragment + + tajo.storage.fragment.orc.class + org.apache.tajo.storage.fragment.FileFragment + tajo.storage.fragment.sequencefile.class org.apache.tajo.storage.fragment.FileFragment @@ -115,6 +119,11 @@ org.apache.tajo.storage.parquet.ParquetScanner + + tajo.storage.scanner-handler.orc.class + org.apache.tajo.storage.orc.ORCScanner + + tajo.storage.scanner-handler.sequencefile.class org.apache.tajo.storage.sequencefile.SequenceFileScanner @@ -128,7 +137,7 @@ tajo.storage.appender-handler - text,csv,raw,rcfile,row,parquet,sequencefile,avro + 
text,csv,raw,rcfile,row,parquet,orc,sequencefile,avro @@ -166,6 +175,11 @@ org.apache.tajo.storage.parquet.ParquetAppender + + tajo.storage.appender-handler.orc.class + org.apache.tajo.storage.orc.ORCAppender + + tajo.storage.appender-handler.sequencefile.class org.apache.tajo.storage.sequencefile.SequenceFileAppender From 8a42f14580c1f68aec2e8724742ebf86cf2d24cb Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 15 Jul 2015 18:52:52 +0900 Subject: [PATCH 116/141] refactoring --- .../src/main/java/org/apache/tajo/datum/DatumFactory.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java index 480582ae7b..0b3f9207b7 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java @@ -143,11 +143,8 @@ public static Datum createFromBytes(DataType dataType, byte[] bytes) { case INET4: return createInet4(bytes); case PROTOBUF: - ProtobufDatumFactory factory = ProtobufDatumFactory.get(dataType); - Message.Builder builder = factory.newBuilder(); try { - builder.mergeFrom(bytes); - return factory.createDatum(builder.build()); + return ProtobufDatumFactory.createDatum(dataType, bytes); } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); From 7d11bbc6b2fe11b2ea1d01f3afc33fd1d92f1e7f Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 16 Jul 2015 16:36:34 +0900 Subject: [PATCH 117/141] test passed --- .../src/main/proto/CatalogProtos.proto | 2 +- .../src/test/resources/storage-default.xml | 7 ++++++- .../orc/{OrcAppender.java => ORCAppender.java} | 8 ++++---- .../apache/tajo/storage/orc/ORCScanner.java | 3 +++ .../TajoBlobObjectInspector.java | 18 ++++++++++++++++++ .../TajoBooleanObjectInspector.java | 18 ++++++++++++++++++ .../TajoStructObjectInspector.java | 4 +--- 
.../storage/thirdparty/orc/WriterImpl.java | 2 +- .../org/apache/tajo/storage/TestStorages.java | 4 ++-- .../org/apache/tajo/storage/orc/TestOrc.java | 15 +++++++-------- 10 files changed, 61 insertions(+), 20 deletions(-) rename tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/{OrcAppender.java => ORCAppender.java} (94%) diff --git a/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto b/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto index 4eb4af4735..0533e6d9e4 100644 --- a/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto +++ b/tajo-catalog/tajo-catalog-common/src/main/proto/CatalogProtos.proto @@ -32,7 +32,7 @@ enum StoreType { RCFILE = 3; ROWFILE = 4; HCFILE = 5; - ORCFILE = 6; + ORC = 6; PARQUET = 7; SEQUENCEFILE = 8; AVRO = 9; diff --git a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml index 41804b3ee7..58ee5f8d72 100644 --- a/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml +++ b/tajo-storage/tajo-storage-common/src/test/resources/storage-default.xml @@ -112,7 +112,7 @@ tajo.storage.scanner-handler.orc.class - org.apache.tajo.storage.orc.OrcScanner + org.apache.tajo.storage.orc.ORCScanner @@ -161,6 +161,11 @@ org.apache.tajo.storage.parquet.ParquetAppender + + tajo.storage.appender-handler.orc.class + org.apache.tajo.storage.orc.ORCAppender + + tajo.storage.appender-handler.sequencefile.class org.apache.tajo.storage.sequencefile.SequenceFileAppender diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java similarity index 94% rename from tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java rename to tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java 
index e06c6d3472..f01d30cdc0 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/OrcAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -35,13 +35,13 @@ import java.io.IOException; -public class OrcAppender extends FileAppender { +public class ORCAppender extends FileAppender { private Writer writer; private TableStatistics stats; - public OrcAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, - TableMeta meta, Path workDir) { + public ORCAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, + TableMeta meta, Path workDir) { super(conf, taskAttemptId, schema, meta, workDir); } @@ -72,7 +72,7 @@ public long getOffset() throws IOException { public void addTuple(Tuple tuple) throws IOException { if (enabledStats) { for (int i = 0; i < schema.size(); ++i) { - stats.analyzeField(i, tuple.get(i)); + stats.analyzeField(i, tuple); } } writer.addTuple(tuple); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java index cca83de57e..e69c5ecb71 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java @@ -249,6 +249,9 @@ private Datum createValueDatum(Vector vector, TajoDataTypes.DataType type) { case PROTOBUF: try { + if (((SliceVector) vector).vector[currentPosInBatch] == null) + return NullDatum.get(); + return ProtobufDatumFactory.createDatum(type, ((SliceVector) vector).vector[currentPosInBatch].getBytes()); } catch (InvalidProtocolBufferException e) { diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java index 0b2d0a70b0..d241f84371 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBlobObjectInspector.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.tajo.storage.orc.objectinspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java index 4fa65fd59b..273505f0cb 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoBooleanObjectInspector.java @@ -1,3 +1,21 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + package org.apache.tajo.storage.orc.objectinspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java index 177c5553a0..a8b4b4945b 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/objectinspector/TajoStructObjectInspector.java @@ -35,7 +35,6 @@ public class TajoStructObjectInspector extends StructObjectInspector { private final static Log LOG = LogFactory.getLog(TajoStructObjectInspector.class); - private Schema schema; private List structFields; static class TajoStructField implements StructField { @@ -70,10 +69,9 @@ public String getFieldComment() { } TajoStructObjectInspector(Schema schema) { - this.schema = schema; structFields = new ArrayList(schema.size()); - for (Column c: schema.getColumns()) { + for (Column c: schema.getRootColumns()) { try { TajoStructField field = new TajoStructField(c.getSimpleName(), ObjectInspectorFactory.buildObjectInspectorByType(c.getDataType().getType())); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index df34ab2449..f9a5e8d680 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -1607,7 +1607,7 @@ void writeTuple(Tuple tuple) throws IOException { if (tuple != null) { for(int i = 0; i < fields.size(); ++i) { TreeWriter writer = 
childrenWriters[i]; - writer.write(tuple.get(i)); + writer.write(tuple.asDatum(i)); } } } diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java index 8dd956dcf4..53f8f1a893 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java @@ -139,7 +139,7 @@ public static Collection generateParameters() { {"RAW", false, true, true}, {"RCFILE", true, true, false}, {"PARQUET", false, false, false}, - {"ORC", true, true, false}, + {"ORC", false, true, false}, {"SEQUENCEFILE", true, true, false}, {"AVRO", false, false, false}, {"TEXT", true, true, true}, @@ -959,7 +959,7 @@ int record = 4 + 8 + 2 + 5 + 8; // required size is 27 public void testLessThanSchemaSize() throws IOException { /* RAW is internal storage. It must be same with schema size */ if (storeType.equalsIgnoreCase("RAW") || storeType.equalsIgnoreCase("AVRO") - || storeType.equalsIgnoreCase("PARQUET")){ + || storeType.equalsIgnoreCase("PARQUET") || storeType.equalsIgnoreCase("ORC")){ return; } diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java index e8833bad36..5a039d0b81 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java @@ -26,7 +26,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; -import org.apache.tajo.catalog.proto.CatalogProtos; import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.conf.TajoConf; import org.apache.tajo.datum.Int2Datum; @@ -51,7 
+50,7 @@ import java.util.List; public class TestOrc { - private OrcScanner orcScanner; + private ORCScanner orcScanner; private static Configuration conf = new TajoConf(); private static FileSystem fs; @@ -84,11 +83,11 @@ public void setup() throws IOException { schema.addColumn("unixtimestamp", TajoDataTypes.Type.TEXT); schema.addColumn("faketime", TajoDataTypes.Type.TIMESTAMP); - TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORC, new KeyValueSet()); + TableMeta meta = new TableMeta("ORC", new KeyValueSet()); Fragment fragment = getFileFragment("u_data_20.orc"); - orcScanner = new OrcScanner(conf, schema, meta, fragment); + orcScanner = new ORCScanner(conf, schema, meta, fragment); orcScanner.init(); } @@ -104,7 +103,7 @@ public void testReadTuple() { assertEquals(tuple.getText(3), "881250949"); // Timestamp test - TimestampDatum timestamp = (TimestampDatum)tuple.get(4); + TimestampDatum timestamp = (TimestampDatum)tuple.asDatum(4); assertEquals(timestamp.getYear(), 2008); assertEquals(timestamp.getMonthOfYear(), 12); @@ -135,9 +134,9 @@ public void testWrite() { fs.delete(writePath); } - TableMeta meta = new TableMeta(CatalogProtos.StoreType.ORC, new KeyValueSet()); + TableMeta meta = new TableMeta("ORC", new KeyValueSet()); - OrcAppender appender = new OrcAppender(conf, null, schema, meta, writePath); + ORCAppender appender = new ORCAppender(conf, null, schema, meta, writePath); appender.init(); @@ -152,7 +151,7 @@ public void testWrite() { appender.close(); Fragment fragment = getFileFragment("temp_test.orc"); - OrcScanner orcScanner = new OrcScanner(conf, schema, meta, fragment); + ORCScanner orcScanner = new ORCScanner(conf, schema, meta, fragment); orcScanner.init(); tuple = orcScanner.next(); From bcf6e84c57842e0d8d87e7aaf522a659a3acec55 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 24 Jul 2015 12:07:35 +0900 Subject: [PATCH 118/141] remove files not used --- .../thirdparty/orc/OrcFileKeyWrapper.java | 114 ---- 
.../thirdparty/orc/OrcFileValueWrapper.java | 92 --- .../tajo/storage/thirdparty/orc/OrcSerde.java | 156 ----- .../storage/thirdparty/orc/OrcStruct.java | 607 ------------------ .../tajo/storage/thirdparty/orc/OrcUnion.java | 160 ----- .../thirdparty/orc/PositionProvider.java | 26 - .../thirdparty/orc/VectorizedOrcSerde.java | 88 --- 7 files changed, 1243 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileKeyWrapper.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileValueWrapper.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcSerde.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcStruct.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUnion.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionProvider.java delete mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/VectorizedOrcSerde.java diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileKeyWrapper.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileKeyWrapper.java deleted file mode 100644 index ce72cf4c0d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileKeyWrapper.java +++ /dev/null @@ -1,114 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.WritableComparable; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; - -/** - * Key for OrcFileMergeMapper task. Contains orc file related information that - * should match before merging two orc files. - */ -public class OrcFileKeyWrapper implements WritableComparable { - - private Path inputPath; - private CompressionKind compression; - private long compressBufferSize; - private List types; - private int rowIndexStride; - private OrcFile.Version version; - private boolean isIncompatFile; - - public boolean isIncompatFile() { - return isIncompatFile; - } - - public void setIsIncompatFile(boolean isIncompatFile) { - this.isIncompatFile = isIncompatFile; - } - - public OrcFile.Version getVersion() { - return version; - } - - public void setVersion(OrcFile.Version version) { - this.version = version; - } - - public int getRowIndexStride() { - return rowIndexStride; - } - - public void setRowIndexStride(int rowIndexStride) { - this.rowIndexStride = rowIndexStride; - } - - public long getCompressBufferSize() { - return compressBufferSize; - } - - public void setCompressBufferSize(long compressBufferSize) { - this.compressBufferSize = compressBufferSize; - } - - public CompressionKind getCompression() { - return 
compression; - } - - public void setCompression(CompressionKind compression) { - this.compression = compression; - } - - public List getTypes() { - return types; - } - - public void setTypes(List types) { - this.types = types; - } - - public Path getInputPath() { - return inputPath; - } - - public void setInputPath(Path inputPath) { - this.inputPath = inputPath; - } - - @Override - public void write(DataOutput out) throws IOException { - throw new RuntimeException("Not supported."); - } - - @Override - public void readFields(DataInput in) throws IOException { - throw new RuntimeException("Not supported."); - } - - @Override - public int compareTo(OrcFileKeyWrapper o) { - return inputPath.compareTo(o.inputPath); - } - -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileValueWrapper.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileValueWrapper.java deleted file mode 100644 index 77daf6c289..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFileValueWrapper.java +++ /dev/null @@ -1,92 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.io.WritableComparable; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.List; - -/** - * Value for OrcFileMergeMapper. Contains stripe related information for the - * current orc file that is being merged. - */ -public class OrcFileValueWrapper implements WritableComparable { - - protected StripeInformation stripeInformation; - protected OrcProto.StripeStatistics stripeStatistics; - protected List userMetadata; - protected boolean lastStripeInFile; - - public List getUserMetadata() { - return userMetadata; - } - - public void setUserMetadata(List userMetadata) { - this.userMetadata = userMetadata; - } - - public boolean isLastStripeInFile() { - return lastStripeInFile; - } - - public void setLastStripeInFile(boolean lastStripeInFile) { - this.lastStripeInFile = lastStripeInFile; - } - - public OrcProto.StripeStatistics getStripeStatistics() { - return stripeStatistics; - } - - public void setStripeStatistics(OrcProto.StripeStatistics stripeStatistics) { - this.stripeStatistics = stripeStatistics; - } - - public StripeInformation getStripeInformation() { - return stripeInformation; - } - - public void setStripeInformation(StripeInformation stripeInformation) { - this.stripeInformation = stripeInformation; - } - - @Override - public void write(DataOutput out) throws IOException { - throw new RuntimeException("Not supported."); - } - - @Override - public void readFields(DataInput in) throws IOException { - throw new RuntimeException("Not supported."); - } - - @Override - public int compareTo(OrcFileValueWrapper o) { - if (stripeInformation.getOffset() < o.getStripeInformation().getOffset()) { - return -1; - } else if (stripeInformation.getOffset() > o.getStripeInformation().getOffset()) { - return 1; - } else { - return 0; - } - } - -} diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcSerde.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcSerde.java deleted file mode 100644 index 087e8a9e9b..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcSerde.java +++ /dev/null @@ -1,156 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.VectorizedSerde; -import org.apache.hadoop.hive.serde.serdeConstants; -import org.apache.hadoop.hive.serde2.SerDe; -import org.apache.hadoop.hive.serde2.SerDeException; -import org.apache.hadoop.hive.serde2.SerDeStats; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils; -import org.apache.hadoop.io.Writable; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Properties; - -/** - * A serde class for ORC. - * It transparently passes the object to/from the ORC file reader/writer. 
- */ -public class OrcSerde implements SerDe, VectorizedSerde { - - private final OrcSerdeRow row = new OrcSerdeRow(); - private ObjectInspector inspector = null; - - private VectorizedOrcSerde vos = null; - - final class OrcSerdeRow implements Writable { - Object realRow; - ObjectInspector inspector; - - @Override - public void write(DataOutput dataOutput) throws IOException { - throw new UnsupportedOperationException("can't write the bundle"); - } - - @Override - public void readFields(DataInput dataInput) throws IOException { - throw new UnsupportedOperationException("can't read the bundle"); - } - - ObjectInspector getInspector() { - return inspector; - } - - Object getRow() { - return realRow; - } - } - - @Override - public void initialize(Configuration conf, Properties table) { - // Read the configuration parameters - String columnNameProperty = table.getProperty(serdeConstants.LIST_COLUMNS); - // NOTE: if "columns.types" is missing, all columns will be of String type - String columnTypeProperty = table.getProperty(serdeConstants.LIST_COLUMN_TYPES); - - // Parse the configuration parameters - ArrayList columnNames = new ArrayList(); - if (columnNameProperty != null && columnNameProperty.length() > 0) { - for (String name : columnNameProperty.split(",")) { - columnNames.add(name); - } - } - if (columnTypeProperty == null) { - // Default type: all string - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < columnNames.size(); i++) { - if (i > 0) { - sb.append(":"); - } - sb.append("string"); - } - columnTypeProperty = sb.toString(); - } - - ArrayList fieldTypes = - TypeInfoUtils.getTypeInfosFromTypeString(columnTypeProperty); - StructTypeInfo rootType = new StructTypeInfo(); - rootType.setAllStructFieldNames(columnNames); - rootType.setAllStructFieldTypeInfos(fieldTypes); - inspector = OrcStruct.createObjectInspector(rootType); - } - - @Override - public Class getSerializedClass() { - return OrcSerdeRow.class; - } - - @Override - public Writable 
serialize(Object realRow, ObjectInspector inspector) { - row.realRow = realRow; - row.inspector = inspector; - return row; - } - - @Override - public Object deserialize(Writable writable) throws SerDeException { - return writable; - } - - @Override - public ObjectInspector getObjectInspector() throws SerDeException { - return inspector; - } - - /** - * Always returns null, since serialized size doesn't make sense in the - * context of ORC files. - * - * @return null - */ - @Override - public SerDeStats getSerDeStats() { - return null; - } - - @Override - public Writable serializeVector(VectorizedRowBatch vrg, ObjectInspector objInspector) - throws SerDeException { - if (vos == null) { - vos = new VectorizedOrcSerde(getObjectInspector()); - } - return vos.serialize(vrg, getObjectInspector()); - } - - @Override - public void deserializeVector(Object rowBlob, int rowsInBatch, VectorizedRowBatch reuseBatch) - throws SerDeException { - // nothing to do here - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcStruct.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcStruct.java deleted file mode 100644 index 6c1d779e86..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcStruct.java +++ /dev/null @@ -1,607 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.hive.common.type.HiveDecimal; -import org.apache.hadoop.hive.serde2.objectinspector.*; -import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; -import org.apache.hadoop.hive.serde2.typeinfo.*; -import org.apache.hadoop.io.Writable; - -import java.io.DataInput; -import java.io.DataOutput; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -final public class OrcStruct implements Writable { - - private Object[] fields; - - OrcStruct(int children) { - fields = new Object[children]; - } - - Object getFieldValue(int fieldIndex) { - return fields[fieldIndex]; - } - - void setFieldValue(int fieldIndex, Object value) { - fields[fieldIndex] = value; - } - - public int getNumFields() { - return fields.length; - } - - /** - * Change the number of fields in the struct. No effect if the number of - * fields is the same. The old field values are copied to the new array. - * @param numFields the new number of fields - */ - public void setNumFields(int numFields) { - if (fields.length != numFields) { - Object[] oldFields = fields; - fields = new Object[numFields]; - System.arraycopy(oldFields, 0, fields, 0, - Math.min(oldFields.length, numFields)); - } - } - - /** - * Destructively make this object link to other's values. 
- * @param other the value to point to - */ - void linkFields(OrcStruct other) { - fields = other.fields; - } - - @Override - public void write(DataOutput dataOutput) throws IOException { - throw new UnsupportedOperationException("write unsupported"); - } - - @Override - public void readFields(DataInput dataInput) throws IOException { - throw new UnsupportedOperationException("readFields unsupported"); - } - - @Override - public boolean equals(Object other) { - if (other == null || other.getClass() != OrcStruct.class) { - return false; - } else { - OrcStruct oth = (OrcStruct) other; - if (fields.length != oth.fields.length) { - return false; - } - for(int i=0; i < fields.length; ++i) { - if (fields[i] == null) { - if (oth.fields[i] != null) { - return false; - } - } else { - if (!fields[i].equals(oth.fields[i])) { - return false; - } - } - } - return true; - } - } - - @Override - public int hashCode() { - int result = fields.length; - for(Object field: fields) { - if (field != null) { - result ^= field.hashCode(); - } - } - return result; - } - - @Override - public String toString() { - StringBuilder buffer = new StringBuilder(); - buffer.append("{"); - for(int i=0; i < fields.length; ++i) { - if (i != 0) { - buffer.append(", "); - } - buffer.append(fields[i]); - } - buffer.append("}"); - return buffer.toString(); - } - - static class Field implements StructField { - private final String name; - private final ObjectInspector inspector; - private final int offset; - - Field(String name, ObjectInspector inspector, int offset) { - this.name = name; - this.inspector = inspector; - this.offset = offset; - } - - @Override - public String getFieldName() { - return name; - } - - @Override - public ObjectInspector getFieldObjectInspector() { - return inspector; - } - - public int getFieldID() { - return offset; - } - - @Override - public String getFieldComment() { - return null; - } - } - - static class OrcStructInspector extends SettableStructObjectInspector { - private 
List fields; - - protected OrcStructInspector() { - super(); - } - - OrcStructInspector(List fields) { - this.fields = fields; - } - - OrcStructInspector(StructTypeInfo info) { - ArrayList fieldNames = info.getAllStructFieldNames(); - ArrayList fieldTypes = info.getAllStructFieldTypeInfos(); - fields = new ArrayList(fieldNames.size()); - for(int i=0; i < fieldNames.size(); ++i) { - fields.add(new Field(fieldNames.get(i), - createObjectInspector(fieldTypes.get(i)), i)); - } - } - - OrcStructInspector(int columnId, List types) { - OrcProto.Type type = types.get(columnId); - int fieldCount = type.getSubtypesCount(); - fields = new ArrayList(fieldCount); - for(int i=0; i < fieldCount; ++i) { - int fieldType = type.getSubtypes(i); - fields.add(new Field(type.getFieldNames(i), - createObjectInspector(fieldType, types), i)); - } - } - - @Override - public List getAllStructFieldRefs() { - return fields; - } - - @Override - public StructField getStructFieldRef(String s) { - for(StructField field: fields) { - if (field.getFieldName().equalsIgnoreCase(s)) { - return field; - } - } - return null; - } - - @Override - public Object getStructFieldData(Object object, StructField field) { - if (object == null) { - return null; - } - int offset = ((Field) field).offset; - OrcStruct struct = (OrcStruct) object; - if (offset >= struct.fields.length) { - return null; - } - - return struct.fields[offset]; - } - - @Override - public List getStructFieldsDataAsList(Object object) { - if (object == null) { - return null; - } - OrcStruct struct = (OrcStruct) object; - List result = new ArrayList(struct.fields.length); - for (Object child: struct.fields) { - result.add(child); - } - return result; - } - - @Override - public String getTypeName() { - StringBuilder buffer = new StringBuilder(); - buffer.append("struct<"); - for(int i=0; i < fields.size(); ++i) { - StructField field = fields.get(i); - if (i != 0) { - buffer.append(","); - } - buffer.append(field.getFieldName()); - 
buffer.append(":"); - buffer.append(field.getFieldObjectInspector().getTypeName()); - } - buffer.append(">"); - return buffer.toString(); - } - - @Override - public Category getCategory() { - return Category.STRUCT; - } - - @Override - public Object create() { - return new OrcStruct(0); - } - - @Override - public Object setStructFieldData(Object struct, StructField field, - Object fieldValue) { - OrcStruct orcStruct = (OrcStruct) struct; - int offset = ((Field) field).offset; - // if the offset is bigger than our current number of fields, grow it - if (orcStruct.getNumFields() <= offset) { - orcStruct.setNumFields(offset+1); - } - orcStruct.setFieldValue(offset, fieldValue); - return struct; - } - - @Override - public boolean equals(Object o) { - if (o == null || o.getClass() != getClass()) { - return false; - } else if (o == this) { - return true; - } else { - List other = ((OrcStructInspector) o).fields; - if (other.size() != fields.size()) { - return false; - } - for(int i = 0; i < fields.size(); ++i) { - StructField left = other.get(i); - StructField right = fields.get(i); - if (!(left.getFieldName().equalsIgnoreCase(right.getFieldName()) && - left.getFieldObjectInspector().equals - (right.getFieldObjectInspector()))) { - return false; - } - } - return true; - } - } - } - - static class OrcMapObjectInspector - implements MapObjectInspector, SettableMapObjectInspector { - private ObjectInspector key; - private ObjectInspector value; - - private OrcMapObjectInspector() { - super(); - } - OrcMapObjectInspector(MapTypeInfo info) { - key = createObjectInspector(info.getMapKeyTypeInfo()); - value = createObjectInspector(info.getMapValueTypeInfo()); - } - - OrcMapObjectInspector(int columnId, List types) { - OrcProto.Type type = types.get(columnId); - key = createObjectInspector(type.getSubtypes(0), types); - value = createObjectInspector(type.getSubtypes(1), types); - } - - @Override - public ObjectInspector getMapKeyObjectInspector() { - return key; - } - - 
@Override - public ObjectInspector getMapValueObjectInspector() { - return value; - } - - @Override - public Object getMapValueElement(Object map, Object key) { - return ((map == null || key == null)? null : ((Map) map).get(key)); - } - - @Override - @SuppressWarnings("unchecked") - public Map getMap(Object map) { - if (map == null) { - return null; - } - return (Map) map; - } - - @Override - public int getMapSize(Object map) { - if (map == null) { - return -1; - } - return ((Map) map).size(); - } - - @Override - public String getTypeName() { - return "map<" + key.getTypeName() + "," + value.getTypeName() + ">"; - } - - @Override - public Category getCategory() { - return Category.MAP; - } - - @Override - public Object create() { - return new HashMap(); - } - - @Override - public Object put(Object map, Object key, Object value) { - ((Map) map).put(key, value); - return map; - } - - @Override - public Object remove(Object map, Object key) { - ((Map) map).remove(key); - return map; - } - - @Override - public Object clear(Object map) { - ((Map) map).clear(); - return map; - } - - @Override - public boolean equals(Object o) { - if (o == null || o.getClass() != getClass()) { - return false; - } else if (o == this) { - return true; - } else { - OrcMapObjectInspector other = (OrcMapObjectInspector) o; - return other.key.equals(key) && other.value.equals(value); - } - } - } - - static class OrcListObjectInspector - implements ListObjectInspector, SettableListObjectInspector { - private ObjectInspector child; - - private OrcListObjectInspector() { - super(); - } - OrcListObjectInspector(ListTypeInfo info) { - child = createObjectInspector(info.getListElementTypeInfo()); - } - - OrcListObjectInspector(int columnId, List types) { - OrcProto.Type type = types.get(columnId); - child = createObjectInspector(type.getSubtypes(0), types); - } - - @Override - public ObjectInspector getListElementObjectInspector() { - return child; - } - - @Override - public Object 
getListElement(Object list, int i) { - if (list == null) { - return null; - } - return ((List) list).get(i); - } - - @Override - public int getListLength(Object list) { - if (list == null) { - return -1; - } - return ((List) list).size(); - } - - @Override - @SuppressWarnings("unchecked") - public List getList(Object list) { - if (list == null) { - return null; - } - return (List) list; - } - - @Override - public String getTypeName() { - return "array<" + child.getTypeName() + ">"; - } - - @Override - public Category getCategory() { - return Category.LIST; - } - - @Override - public Object create(int size) { - ArrayList result = new ArrayList(size); - for(int i = 0; i < size; ++i) { - result.add(null); - } - return result; - } - - @Override - public Object set(Object list, int index, Object element) { - List l = (List) list; - for(int i=l.size(); i < index+1; ++i) { - l.add(null); - } - l.set(index, element); - return list; - } - - @Override - public Object resize(Object list, int newSize) { - ((ArrayList) list).ensureCapacity(newSize); - return list; - } - - @Override - public boolean equals(Object o) { - if (o == null || o.getClass() != getClass()) { - return false; - } else if (o == this) { - return true; - } else { - ObjectInspector other = ((OrcListObjectInspector) o).child; - return other.equals(child); - } - } - } - - static public ObjectInspector createObjectInspector(TypeInfo info) { - switch (info.getCategory()) { - case PRIMITIVE: - switch (((PrimitiveTypeInfo) info).getPrimitiveCategory()) { - case FLOAT: - return PrimitiveObjectInspectorFactory.writableFloatObjectInspector; - case DOUBLE: - return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; - case BOOLEAN: - return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; - case BYTE: - return PrimitiveObjectInspectorFactory.writableByteObjectInspector; - case SHORT: - return PrimitiveObjectInspectorFactory.writableShortObjectInspector; - case INT: - return 
PrimitiveObjectInspectorFactory.writableIntObjectInspector; - case LONG: - return PrimitiveObjectInspectorFactory.writableLongObjectInspector; - case BINARY: - return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; - case STRING: - return PrimitiveObjectInspectorFactory.writableStringObjectInspector; - case CHAR: - return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - (PrimitiveTypeInfo) info); - case VARCHAR: - return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - (PrimitiveTypeInfo) info); - case TIMESTAMP: - return PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; - case DATE: - return PrimitiveObjectInspectorFactory.writableDateObjectInspector; - case DECIMAL: - return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - (PrimitiveTypeInfo) info); - default: - throw new IllegalArgumentException("Unknown primitive type " + - ((PrimitiveTypeInfo) info).getPrimitiveCategory()); - } - case STRUCT: - return new OrcStructInspector((StructTypeInfo) info); - case UNION: - return new OrcUnion.OrcUnionObjectInspector((UnionTypeInfo) info); - case MAP: - return new OrcMapObjectInspector((MapTypeInfo) info); - case LIST: - return new OrcListObjectInspector((ListTypeInfo) info); - default: - throw new IllegalArgumentException("Unknown type " + - info.getCategory()); - } - } - - static ObjectInspector createObjectInspector(int columnId, - List types){ - OrcProto.Type type = types.get(columnId); - switch (type.getKind()) { - case FLOAT: - return PrimitiveObjectInspectorFactory.writableFloatObjectInspector; - case DOUBLE: - return PrimitiveObjectInspectorFactory.writableDoubleObjectInspector; - case BOOLEAN: - return PrimitiveObjectInspectorFactory.writableBooleanObjectInspector; - case BYTE: - return PrimitiveObjectInspectorFactory.writableByteObjectInspector; - case SHORT: - return PrimitiveObjectInspectorFactory.writableShortObjectInspector; - case INT: - return 
PrimitiveObjectInspectorFactory.writableIntObjectInspector; - case LONG: - return PrimitiveObjectInspectorFactory.writableLongObjectInspector; - case BINARY: - return PrimitiveObjectInspectorFactory.writableBinaryObjectInspector; - case STRING: - return PrimitiveObjectInspectorFactory.writableStringObjectInspector; - case CHAR: - if (!type.hasMaximumLength()) { - throw new UnsupportedOperationException( - "Illegal use of char type without length in ORC type definition."); - } - return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - TypeInfoFactory.getCharTypeInfo(type.getMaximumLength())); - case VARCHAR: - if (!type.hasMaximumLength()) { - throw new UnsupportedOperationException( - "Illegal use of varchar type without length in ORC type definition."); - } - return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - TypeInfoFactory.getVarcharTypeInfo(type.getMaximumLength())); - case TIMESTAMP: - return PrimitiveObjectInspectorFactory.writableTimestampObjectInspector; - case DATE: - return PrimitiveObjectInspectorFactory.writableDateObjectInspector; - case DECIMAL: - int precision = type.hasPrecision() ? type.getPrecision() : HiveDecimal.SYSTEM_DEFAULT_PRECISION; - int scale = type.hasScale()? 
type.getScale() : HiveDecimal.SYSTEM_DEFAULT_SCALE; - return PrimitiveObjectInspectorFactory.getPrimitiveWritableObjectInspector( - TypeInfoFactory.getDecimalTypeInfo(precision, scale)); - case STRUCT: - return new OrcStructInspector(columnId, types); - case UNION: - return new OrcUnion.OrcUnionObjectInspector(columnId, types); - case MAP: - return new OrcMapObjectInspector(columnId, types); - case LIST: - return new OrcListObjectInspector(columnId, types); - default: - throw new UnsupportedOperationException("Unknown type " + - type.getKind()); - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUnion.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUnion.java deleted file mode 100644 index 1bc2b5d38e..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcUnion.java +++ /dev/null @@ -1,160 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.UnionObject; -import org.apache.hadoop.hive.serde2.objectinspector.UnionObjectInspector; -import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.UnionTypeInfo; - -import java.util.ArrayList; -import java.util.List; - -/** - * An in-memory representation of a union type. - */ -final class OrcUnion implements UnionObject { - private byte tag; - private Object object; - - void set(byte tag, Object object) { - this.tag = tag; - this.object = object; - } - - @Override - public byte getTag() { - return tag; - } - - @Override - public Object getObject() { - return object; - } - - @Override - public boolean equals(Object other) { - if (other == null || other.getClass() != OrcUnion.class) { - return false; - } - OrcUnion oth = (OrcUnion) other; - if (tag != oth.tag) { - return false; - } else if (object == null) { - return oth.object == null; - } else { - return object.equals(oth.object); - } - } - - @Override - public int hashCode() { - int result = tag; - if (object != null) { - result ^= object.hashCode(); - } - return result; - } - - @Override - public String toString() { - return "union(" + Integer.toString(tag & 0xff) + ", " + object + ")"; - } - - static class OrcUnionObjectInspector implements UnionObjectInspector { - private List children; - - protected OrcUnionObjectInspector() { - super(); - } - OrcUnionObjectInspector(int columnId, - List types) { - OrcProto.Type type = types.get(columnId); - children = new ArrayList(type.getSubtypesCount()); - for(int i=0; i < type.getSubtypesCount(); ++i) { - children.add(OrcStruct.createObjectInspector(type.getSubtypes(i), - types)); - } - } - - OrcUnionObjectInspector(UnionTypeInfo info) { - List unionChildren = info.getAllUnionObjectTypeInfos(); - this.children = new ArrayList(unionChildren.size()); 
- for(TypeInfo child: info.getAllUnionObjectTypeInfos()) { - this.children.add(OrcStruct.createObjectInspector(child)); - } - } - - @Override - public List getObjectInspectors() { - return children; - } - - @Override - public byte getTag(Object obj) { - return ((OrcUnion) obj).tag; - } - - @Override - public Object getField(Object obj) { - return ((OrcUnion) obj).object; - } - - @Override - public String getTypeName() { - StringBuilder builder = new StringBuilder("uniontype<"); - boolean first = true; - for(ObjectInspector child: children) { - if (first) { - first = false; - } else { - builder.append(","); - } - builder.append(child.getTypeName()); - } - builder.append(">"); - return builder.toString(); - } - - @Override - public Category getCategory() { - return Category.UNION; - } - - @Override - public boolean equals(Object o) { - if (o == null || o.getClass() != getClass()) { - return false; - } else if (o == this) { - return true; - } else { - List other = ((OrcUnionObjectInspector) o).children; - if (other.size() != children.size()) { - return false; - } - for(int i = 0; i < children.size(); ++i) { - if (!other.get(i).equals(children.get(i))) { - return false; - } - } - return true; - } - } - } -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionProvider.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionProvider.java deleted file mode 100644 index 54b5ab6da1..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/PositionProvider.java +++ /dev/null @@ -1,26 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.tajo.storage.thirdparty.orc; - -/** - * An interface used for seeking to a row index. - */ -public interface PositionProvider { - long getNext(); -} diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/VectorizedOrcSerde.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/VectorizedOrcSerde.java deleted file mode 100644 index aea1d89e4d..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/VectorizedOrcSerde.java +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tajo.storage.thirdparty.orc; - -import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriter; -import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpressionWriterFactory; -import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.hadoop.io.ObjectWritable; -import org.apache.hadoop.io.Writable; - -/** - * A serde class for ORC. - * It transparently passes the object to/from the ORC file reader/writer. - */ -public class VectorizedOrcSerde extends OrcSerde { - private final OrcStruct[] orcStructArray = new OrcStruct[VectorizedRowBatch.DEFAULT_SIZE]; - private final Writable [] orcRowArray = new Writable [VectorizedRowBatch.DEFAULT_SIZE]; - private final ObjectWritable ow = new ObjectWritable(); - private final ObjectInspector inspector = null; - private final VectorExpressionWriter[] valueWriters; - - public VectorizedOrcSerde(ObjectInspector objInspector) { - super(); - for (int i = 0; i < orcStructArray.length; i++) { - orcRowArray[i] = new OrcSerde.OrcSerdeRow(); - } - try { - valueWriters = VectorExpressionWriterFactory - .getExpressionWriters((StructObjectInspector) objInspector); - } catch (HiveException e) { - throw new RuntimeException(e); - } - } - - - @Override - public Writable serialize(Object obj, ObjectInspector inspector) { - VectorizedRowBatch batch = (VectorizedRowBatch) obj; - try { - for (int i = 0; i < batch.size; i++) { - OrcStruct ost = orcStructArray[i]; - if (ost == null) { - ost = new OrcStruct(batch.numCols); - orcStructArray[i] = ost; - } - int index = 0; - if (batch.selectedInUse) { - index = batch.selected[i]; - } else { - index = i; - } - for (int p 
= 0; p < batch.projectionSize; p++) { - int k = batch.projectedColumns[p]; - if (batch.cols[k].isRepeating) { - valueWriters[p].setValue(ost, batch.cols[k], 0); - } else { - valueWriters[p].setValue(ost, batch.cols[k], index); - } - } - OrcSerde.OrcSerdeRow row = (OrcSerde.OrcSerdeRow) orcRowArray[i]; - row.realRow = ost; - row.inspector = inspector; - } - } catch (HiveException ex) { - throw new RuntimeException(ex); - } - ow.set(orcRowArray); - return ow; - } -} From fd63cc754f9f234a06e1f4dbe67875231e3c4fa6 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 24 Jul 2015 15:53:11 +0900 Subject: [PATCH 119/141] HIVE-11122: ORC should not record the timezone information when there are no timestamp columns --- .../tajo/storage/thirdparty/orc/WriterImpl.java | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index f9a5e8d680..6167dbe368 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -123,6 +123,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback { private final OrcFile.CompressionStrategy compressionStrategy; private final boolean[] bloomFilterColumns; private final double bloomFilterFpp; + private boolean writeTimeZone; WriterImpl(FileSystem fs, Path path, @@ -587,6 +588,14 @@ public Configuration getConfiguration() { public OrcFile.Version getVersion() { return version; } + + public void useWriterTimeZone(boolean val) { + writeTimeZone = val; + } + + public boolean hasWriterTimeZone() { + return writeTimeZone; + } } /** @@ -616,6 +625,7 @@ private abstract static class TreeWriter { private boolean foundNulls; private OutStream 
isPresentOutStream; private final List stripeStatsBuilders; + private final StreamFactory streamFactory; /** * Create a tree writer. @@ -628,6 +638,7 @@ private abstract static class TreeWriter { TreeWriter(int columnId, ObjectInspector inspector, StreamFactory streamFactory, boolean nullable) throws IOException { + this.streamFactory = streamFactory; this.isCompressed = streamFactory.isCompressed(); this.id = columnId; this.inspector = inspector; @@ -786,7 +797,9 @@ void writeStripe(OrcProto.StripeFooter.Builder builder, foundNulls = false; builder.addColumns(getEncoding()); - builder.setWriterTimezone(TimeZone.getDefault().getID()); + if (streamFactory.hasWriterTimeZone()) { + builder.setWriterTimezone(TimeZone.getDefault().getID()); + } if (rowIndexStream != null) { if (rowIndex.getEntryCount() != requiredIndexEntries) { throw new IllegalArgumentException("Column has wrong number of " + @@ -1466,6 +1479,7 @@ private static class TimestampTreeWriter extends TreeWriter { recordPosition(rowIndexPosition); // for unit tests to set different time zones this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND; + writer.useWriterTimeZone(true); } @Override From fba5177a15f61dd3b2784802d4f149682da17369 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Fri, 24 Jul 2015 16:06:28 +0900 Subject: [PATCH 120/141] HIVE-10191: Remove per-row synchronization from ORC WriterImpl --- .../storage/thirdparty/orc/MemoryManager.java | 36 ++++++++++++--- .../tajo/storage/thirdparty/orc/OrcFile.java | 13 ++++-- .../storage/thirdparty/orc/WriterImpl.java | 45 ++++++++++--------- 3 files changed, 62 insertions(+), 32 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java index d29e314a85..8cd40f73e9 100644 --- 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/MemoryManager.java @@ -18,6 +18,7 @@ package org.apache.tajo.storage.thirdparty.orc; +import com.google.common.base.Preconditions; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -28,6 +29,7 @@ import java.lang.management.ManagementFactory; import java.util.HashMap; import java.util.Map; +import java.util.concurrent.locks.ReentrantLock; /** * Implements a memory manager that keeps a global context of how many ORC @@ -36,8 +38,8 @@ * By managing the size of each allocation, we try to cut down the size of each * allocation and keep the task from running out of memory. * - * This class is thread safe and uses synchronization around the shared state - * to prevent race conditions. + * This class is not thread safe, but is re-entrant - ensure creation and all + * invocations are triggered from the same thread. */ class MemoryManager { @@ -54,6 +56,14 @@ class MemoryManager { private long totalAllocation = 0; private double currentScale = 1; private int rowsAddedSinceCheck = 0; + private final OwnedLock ownerLock = new OwnedLock(); + + @SuppressWarnings("serial") + private static class OwnedLock extends ReentrantLock { + public Thread getOwner() { + return super.getOwner(); + } + } private static class WriterInfo { long allocation; @@ -84,6 +94,17 @@ public interface Callback { double maxLoad = conf.getFloat(poolVar.varname, poolVar.defaultFloatVal); totalMemoryPool = Math.round(ManagementFactory.getMemoryMXBean(). 
getHeapMemoryUsage().getMax() * maxLoad); + ownerLock.lock(); + } + + /** + * Light weight thread-safety check for multi-threaded access patterns + */ + private void checkOwner() { + Preconditions.checkArgument(ownerLock.isHeldByCurrentThread(), + "Owner thread expected %s, got %s", + ownerLock.getOwner(), + Thread.currentThread()); } /** @@ -92,8 +113,9 @@ public interface Callback { * @param path the file that is being written * @param requestedAllocation the requested buffer size */ - synchronized void addWriter(Path path, long requestedAllocation, + void addWriter(Path path, long requestedAllocation, Callback callback) throws IOException { + checkOwner(); WriterInfo oldVal = writerList.get(path); // this should always be null, but we handle the case where the memory // manager wasn't told that a writer wasn't still in use and the task @@ -115,7 +137,8 @@ synchronized void addWriter(Path path, long requestedAllocation, * Remove the given writer from the pool. * @param path the file that has been closed */ - synchronized void removeWriter(Path path) throws IOException { + void removeWriter(Path path) throws IOException { + checkOwner(); WriterInfo val = writerList.get(path); if (val != null) { writerList.remove(path); @@ -144,7 +167,7 @@ long getTotalMemoryPool() { * @return a fraction between 0.0 and 1.0 of the requested size that is * available for each writer. */ - synchronized double getAllocationScale() { + double getAllocationScale() { return currentScale; } @@ -152,7 +175,7 @@ synchronized double getAllocationScale() { * Give the memory manager an opportunity for doing a memory check. 
* @throws IOException */ - synchronized void addedRow() throws IOException { + void addedRow() throws IOException { if (++rowsAddedSinceCheck >= ROWS_BETWEEN_CHECKS) { notifyWriters(); } @@ -163,6 +186,7 @@ synchronized void addedRow() throws IOException { * @throws IOException */ void notifyWriters() throws IOException { + checkOwner(); LOG.debug("Notifying writers after " + rowsAddedSinceCheck); for(WriterInfo writer: writerList.values()) { boolean flushed = writer.callback.checkMemory(currentScale); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java index e49c03af90..3bbd81411a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java @@ -431,13 +431,18 @@ public static Writer createWriter(FileSystem fs, .rowIndexStride(rowIndexStride)); } - private static MemoryManager memoryManager = null; + private static ThreadLocal memoryManager = null; - private static synchronized MemoryManager getMemoryManager(Configuration conf) { + private static synchronized MemoryManager getMemoryManager(final Configuration conf) { if (memoryManager == null) { - memoryManager = new MemoryManager(conf); + memoryManager = new ThreadLocal() { + @Override + protected MemoryManager initialValue() { + return new MemoryManager(conf); + } + }; } - return memoryManager; + return memoryManager.get(); } } diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 6167dbe368..e6ae38cc6c 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -67,9 +67,14 @@ * sub-types. Each of the TreeWriters writes the column's data as a set of * streams. * - * This class is synchronized so that multi-threaded access is ok. In - * particular, because the MemoryManager is shared between writers, this class - * assumes that checkMemory may be called from a separate thread. + * This class is unsynchronized like most Stream objects, so from the creation of an OrcFile and all + * access to a single instance has to be from a single thread. + * + * There are no known cases where these happen between different threads today. + * + * Caveat: the MemoryManager is created during WriterOptions create, that has to be confined to a single + * thread as well. + * */ public class WriterImpl implements Writer, MemoryManager.Callback { @@ -313,7 +318,7 @@ public static CompressionCodec createCodec(CompressionKind kind) { } @Override - public synchronized boolean checkMemory(double newScale) throws IOException { + public boolean checkMemory(double newScale) throws IOException { long limit = (long) Math.round(adjustedStripeSize * newScale); long size = estimateStripeSize(); if (LOG.isDebugEnabled()) { @@ -2089,20 +2094,18 @@ private long estimateStripeSize() { } @Override - public synchronized void addUserMetadata(String name, ByteBuffer value) { + public void addUserMetadata(String name, ByteBuffer value) { userMetadata.put(name, ByteString.copyFrom(value)); } public void addTuple(Tuple tuple) throws IOException { - synchronized (this) { - ((StructTreeWriter)treeWriter).writeTuple(tuple); - rowsInStripe += 1; - if (buildIndex) { - rowsInIndex += 1; - - if (rowsInIndex >= rowIndexStride) { - createRowIndexEntry(); - } + ((StructTreeWriter)treeWriter).writeTuple(tuple); + rowsInStripe += 1; + if (buildIndex) { + rowsInIndex += 1; + + if (rowsInIndex >= rowIndexStride) { + createRowIndexEntry(); } } memoryManager.addedRow(); @@ 
-2116,13 +2119,11 @@ public void close() throws IOException { // remove us from the memory manager so that we don't get any callbacks memoryManager.removeWriter(path); // actually close the file - synchronized (this) { - flushStripe(); - int metadataLength = writeMetadata(rawWriter.getPos()); - int footerLength = writeFooter(rawWriter.getPos() - metadataLength); - rawWriter.writeByte(writePostScript(footerLength, metadataLength)); - rawWriter.close(); - } + flushStripe(); + int metadataLength = writeMetadata(rawWriter.getPos()); + int footerLength = writeFooter(rawWriter.getPos() - metadataLength); + rawWriter.writeByte(writePostScript(footerLength, metadataLength)); + rawWriter.close(); } /** @@ -2144,7 +2145,7 @@ public long getNumberOfRows() { } @Override - public synchronized long writeIntermediateFooter() throws IOException { + public long writeIntermediateFooter() throws IOException { // flush any buffered rows flushStripe(); // write a footer From 97d6f68b83050826f846824a51c39ec2d5bf5c73 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 28 Jul 2015 23:19:16 +0900 Subject: [PATCH 121/141] WriterTimeZone is from TableMeta --- .../apache/tajo/storage/orc/ORCAppender.java | 10 ++++++++-- .../tajo/storage/thirdparty/orc/OrcFile.java | 18 +++++++++++++++--- .../storage/thirdparty/orc/WriterImpl.java | 11 +++++++++-- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java index f01d30cdc0..4544ed34d4 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCAppender.java @@ -20,6 +20,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; +import org.apache.tajo.TajoConstants; import org.apache.tajo.TaskAttemptId; import 
org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; @@ -34,15 +35,19 @@ import org.apache.tajo.storage.thirdparty.orc.Writer; import java.io.IOException; +import java.util.TimeZone; public class ORCAppender extends FileAppender { private Writer writer; private TableStatistics stats; - + private TimeZone timezone; public ORCAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, TableMeta meta, Path workDir) { super(conf, taskAttemptId, schema, meta, workDir); + + timezone = TimeZone.getTimeZone(meta.getOption(StorageConstants.TIMEZONE, + TajoConstants.DEFAULT_SYSTEM_TIMEZONE)); } @Override @@ -54,7 +59,8 @@ public void init() throws IOException { Integer.parseInt(meta.getOption(StorageConstants.ORC_BUFFER_SIZE, StorageConstants.DEFAULT_ORC_BUFFER_SIZE)), Integer.parseInt(meta.getOption(StorageConstants.ORC_ROW_INDEX_STRIDE, - StorageConstants.DEFAULT_ORC_ROW_INDEX_STRIDE))); + StorageConstants.DEFAULT_ORC_ROW_INDEX_STRIDE)), + timezone); if (enabledStats) { this.stats = new TableStatistics(schema); diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java index 3bbd81411a..a291953981 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/OrcFile.java @@ -26,6 +26,7 @@ import static org.apache.tajo.storage.thirdparty.orc.OrcConf.ConfVars.*; import java.io.IOException; +import java.util.TimeZone; /** * Contains factory methods to read or write ORC files. 
@@ -195,6 +196,7 @@ public static class WriterOptions { private float paddingTolerance; private String bloomFilterColumns; private double bloomFilterFpp; + private TimeZone timezone; WriterOptions(Configuration conf) { configuration = conf; @@ -366,6 +368,13 @@ WriterOptions memory(MemoryManager value) { return this; } + /** + * Tajo-specific + */ + WriterOptions timezone(TimeZone value) { + timezone = value; + return this; + } } /** @@ -396,7 +405,8 @@ public static Writer createWriter(Path path, opts.versionValue, opts.callback, opts.encodingStrategy, opts.compressionStrategy, opts.paddingTolerance, opts.blockSizeValue, - opts.bloomFilterColumns, opts.bloomFilterFpp); + opts.bloomFilterColumns, opts.bloomFilterFpp, + opts.timezone); } /** @@ -420,7 +430,8 @@ public static Writer createWriter(FileSystem fs, long stripeSize, CompressionKind compress, int bufferSize, - int rowIndexStride) throws IOException { + int rowIndexStride, + TimeZone timeZone) throws IOException { return createWriter(path, writerOptions(conf) .fileSystem(fs) @@ -428,7 +439,8 @@ public static Writer createWriter(FileSystem fs, .stripeSize(stripeSize) .compress(compress) .bufferSize(bufferSize) - .rowIndexStride(rowIndexStride)); + .rowIndexStride(rowIndexStride) + .timezone(timeZone)); } private static ThreadLocal memoryManager = null; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index e6ae38cc6c..946082952d 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -129,6 +129,7 @@ public class WriterImpl implements Writer, MemoryManager.Callback { private final boolean[] bloomFilterColumns; private final double bloomFilterFpp; private boolean writeTimeZone; + 
private TimeZone timeZone; WriterImpl(FileSystem fs, Path path, @@ -147,7 +148,8 @@ public class WriterImpl implements Writer, MemoryManager.Callback { float paddingTolerance, long blockSizeValue, String bloomFilterColumnNames, - double bloomFilterFpp) throws IOException { + double bloomFilterFpp, + TimeZone timeZone) throws IOException { this.fs = fs; this.path = path; this.conf = conf; @@ -174,6 +176,7 @@ public Writer getWriter() { this.compress = compress; this.rowIndexStride = rowIndexStride; this.memoryManager = memoryManager; + this.timeZone = timeZone; buildIndex = rowIndexStride > 0; codec = createCodec(compress); String allColumns = conf.get(IOConstants.COLUMNS); @@ -601,6 +604,10 @@ public void useWriterTimeZone(boolean val) { public boolean hasWriterTimeZone() { return writeTimeZone; } + + public TimeZone getTimeZone() { + return timeZone; + } } /** @@ -803,7 +810,7 @@ void writeStripe(OrcProto.StripeFooter.Builder builder, builder.addColumns(getEncoding()); if (streamFactory.hasWriterTimeZone()) { - builder.setWriterTimezone(TimeZone.getDefault().getID()); + builder.setWriterTimezone(streamFactory.getTimeZone().getID()); } if (rowIndexStream != null) { if (rowIndex.getEntryCount() != requiredIndexEntries) { From e31a1b2f2a959809dd2130f7596d13c7d9cc4fca Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 28 Jul 2015 23:36:29 +0900 Subject: [PATCH 122/141] some package-info.javas added --- .../apache/tajo/storage/orc/package-info.java | 95 +++++++++++++++++++ .../storage/thirdparty/orc/package-info.java | 27 ++++++ 2 files changed, 122 insertions(+) create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/package-info.java create mode 100644 tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/package-info.java diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/package-info.java 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/package-info.java new file mode 100644 index 0000000000..a987bb9603 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/package-info.java @@ -0,0 +1,95 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + *

+ * Provides read and write support for ORC files. Tajo schemas are + * converted to ORC struct type + *

+ * + *
+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
Tajo typeORC type
NULL_TYPEBOOLEAN, but all fields are marked as null
BOOLEANBOOLEAN
BYTEBYTE
INT2SHORT
INT4INTEGER
INT8LONG
FLOAT4FLOAT
FLOAT8DOUBLE
CHAR/TEXTSTRING
BLOB/PROTOBUFBINARY
INET4INTEGER
TIMESTAMPTIMESTAMP
DATEDATE
+ * + *

+ * Because Tajo fields can be NULL, all ORC fields are marked as optional. + *

+ * + *

+ * The conversion from Tajo to ORC is lossy without the original Tajo + * schema. As a result, ORC files are read using the Tajo schema saved in + * the Tajo catalog for the table the ORC files belong to, which was + * defined when the table was created. + *

+ */ + +package org.apache.tajo.storage.orc; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/package-info.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/package-info.java new file mode 100644 index 0000000000..1c97124766 --- /dev/null +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/package-info.java @@ -0,0 +1,27 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +/** + *

+ * Provides read and write support for ORC files. + * Source files in this packages are from Hive. + * But, some files are modified for supporting the concept of Tajo Tuple and Datum. + * One of representative files is WriterImpl.java. Others are almost same as ones in Hive. + *

+ */ + +package org.apache.tajo.storage.thirdparty.orc; From 0d7805ca4316ca9c7b93ae5e47973f1c0598c49a Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 29 Jul 2015 18:13:20 +0900 Subject: [PATCH 123/141] Remove useless test file --- .../tajo/storage/orc/TestORCScanner.java | 107 ------------------ 1 file changed, 107 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java deleted file mode 100644 index b4117931fe..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestORCScanner.java +++ /dev/null @@ -1,107 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.orc; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.catalog.TableMeta; -import org.apache.tajo.common.TajoDataTypes; -import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.datum.TimestampDatum; -import org.apache.tajo.storage.Tuple; -import org.apache.tajo.storage.fragment.FileFragment; -import org.apache.tajo.storage.fragment.Fragment; -import org.apache.tajo.util.KeyValueSet; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import static org.junit.Assert.*; - -import java.io.IOException; -import java.net.URL; - -public class TestORCScanner { - private ORCScanner orcScanner; - - public static Path getResourcePath(String path, String suffix) { - URL resultBaseURL = ClassLoader.getSystemResource(path); - return new Path(resultBaseURL.toString(), suffix); - } - - private static FileFragment getFileFragment(Configuration conf, String fileName) throws IOException { - Path tablePath = new Path(getResourcePath("dataset", "."), fileName); - FileSystem fs = FileSystem.getLocal(conf); - FileStatus status = fs.getFileStatus(tablePath); - return new FileFragment("table", tablePath, 0, status.getLen()); - } - - @Before - public void setup() throws IOException { - Schema schema = new Schema(); - schema.addColumn("userid", TajoDataTypes.Type.INT4); - schema.addColumn("movieid", TajoDataTypes.Type.INT4); - schema.addColumn("rating", TajoDataTypes.Type.INT2); - schema.addColumn("unixtimestamp", TajoDataTypes.Type.TEXT); - schema.addColumn("faketime", TajoDataTypes.Type.TIMESTAMP); - - Configuration conf = new TajoConf(); - - TableMeta meta = new TableMeta("ORC", new KeyValueSet()); - - Fragment fragment = getFileFragment(conf, "u_data_20.orc"); - - orcScanner = new ORCScanner(conf, schema, meta, fragment); - - 
orcScanner.init(); - } - - @Test - public void testReadTuple() { - try { - Tuple tuple = orcScanner.next(); - - assertEquals(tuple.getInt4(0), 196); - assertEquals(tuple.getInt4(1), 242); - assertEquals(tuple.getInt2(2), 3); - assertEquals(tuple.getText(3), "881250949"); - - // Timestamp test - TimestampDatum timestamp = (TimestampDatum)tuple.asDatum(4); - - assertEquals(timestamp.getYear(), 2008); - assertEquals(timestamp.getMonthOfYear(), 12); - assertEquals(timestamp.getDayOfMonth(), 12); - } catch (IOException e) { - e.printStackTrace(); - } - } - - @After - public void end() { - try { - orcScanner.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } -} \ No newline at end of file From 17732092dfd7f2d8d7512d2e4905ad9922e98554 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 29 Jul 2015 18:28:16 +0900 Subject: [PATCH 124/141] Timezone code in ORCScanner is also modified --- .../java/org/apache/tajo/storage/orc/ORCScanner.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java index 9b6b0ab0a7..a158f36521 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.tajo.TajoConstants; import org.apache.tajo.catalog.Schema; import org.apache.tajo.catalog.TableMeta; import org.apache.tajo.common.TajoDataTypes; @@ -46,6 +47,7 @@ import java.io.IOException; import java.util.HashSet; import java.util.Set; +import java.util.TimeZone; /** * OrcScanner for reading ORC files @@ -152,11 +154,13 @@ public void init() throws IOException { orcReader = new 
OrcReader(orcDataSource, new OrcMetadataReader()); + TimeZone timezone = TimeZone.getTimeZone(meta.getOption(StorageConstants.TIMEZONE, + TajoConstants.DEFAULT_SYSTEM_TIMEZONE)); + // TODO: make OrcPredicate useful - // TODO: TimeZone should be from conf - // TODO: it might be splittable + // presto-orc uses joda timezone, so it needs to be converted. recordReader = orcReader.createRecordReader(columnSet, OrcPredicate.TRUE, - fragment.getStartKey(), fragment.getLength(), DateTimeZone.getDefault()); + fragment.getStartKey(), fragment.getLength(), DateTimeZone.forTimeZone(timezone)); LOG.debug("file fragment { path: " + fragment.getPath() + ", start offset: " + fragment.getStartKey() + @@ -185,7 +189,6 @@ public Tuple next() throws IOException { return outTuple; } - // TODO: support more types private Datum createValueDatum(Vector vector, TajoDataTypes.DataType type) { switch (type.getType()) { case INT1: From bc31a48e476c4bdcc3a8a23fb3d976a9ff79ec3d Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 19 Aug 2015 12:32:56 +0900 Subject: [PATCH 125/141] includeTimezone flag is for TIMESTAMPZ type, so changed to 'false' --- .../tajo/storage/text/TextFieldSerializerDeserializer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java index 48aaf29c6d..d9d2639848 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/text/TextFieldSerializerDeserializer.java @@ -124,7 +124,7 @@ public int serialize(int columnIndex, Tuple tuple, OutputStream out, byte[] null break; case TIME: if (hasTimezone) { - bytes = TimeDatum.asChars(tuple.getTimeDate(columnIndex), timezone, 
true).getBytes(Bytes.UTF8_CHARSET); + bytes = TimeDatum.asChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); } else { bytes = tuple.getTextBytes(columnIndex); } @@ -133,7 +133,7 @@ public int serialize(int columnIndex, Tuple tuple, OutputStream out, byte[] null break; case TIMESTAMP: if (hasTimezone) { - bytes = TimestampDatum.asChars(tuple.getTimeDate(columnIndex), timezone, true).getBytes(Bytes.UTF8_CHARSET); + bytes = TimestampDatum.asChars(tuple.getTimeDate(columnIndex), timezone, false).getBytes(Bytes.UTF8_CHARSET); } else { bytes = tuple.getTextBytes(columnIndex); } From 9b9df57198a1a91a5f881efbe250a5a35bf5ea35 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 19 Aug 2015 16:02:44 +0900 Subject: [PATCH 126/141] empty test table can be created --- .../src/test/java/org/apache/tajo/QueryTestCaseBase.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java index c3c9b52dac..e460d7b0f8 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java @@ -811,7 +811,8 @@ private Path getDataSetFile(String fileName) throws IOException { throw new IOException("Cannot find " + fileName + " at " + currentQueryPath + " and " + namedQueryPath); } } else { - throw new IOException("Cannot find " + fileName + " at " + currentQueryPath + " and " + namedQueryPath); + // to make empty table (used to insert test) + fs.mkdirs(dataFilePath); } } return dataFilePath; From 56509700d08ee8d2d6819d30c32794aa889fe21c Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 19 Aug 2015 17:23:18 +0900 Subject: [PATCH 127/141] Timezone applied when data is written --- .../apache/tajo/storage/thirdparty/orc/WriterImpl.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git 
a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java index 946082952d..e520082b9a 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/WriterImpl.java @@ -1477,6 +1477,7 @@ private static class TimestampTreeWriter extends TreeWriter { private final IntegerWriter nanos; private final boolean isDirectV2; private final long base_timestamp; + private TimeZone timeZone; TimestampTreeWriter(int columnId, ObjectInspector inspector, @@ -1492,6 +1493,7 @@ private static class TimestampTreeWriter extends TreeWriter { // for unit tests to set different time zones this.base_timestamp = Timestamp.valueOf(BASE_TIMESTAMP_STRING).getTime() / MILLIS_PER_SECOND; writer.useWriterTimeZone(true); + timeZone = writer.getTimeZone(); } @Override @@ -1508,7 +1510,12 @@ OrcProto.ColumnEncoding getEncoding() { void write(Datum datum) throws IOException { super.write(datum); if (datum != null && datum.isNotNull()) { - Timestamp val = new Timestamp(DateTimeUtil.julianTimeToJavaTime(datum.asInt8())); + long javaTimestamp = DateTimeUtil.julianTimeToJavaTime(datum.asInt8()); + + // revise timestamp value depends on timezone + javaTimestamp += timeZone.getRawOffset(); + + Timestamp val = new Timestamp(javaTimestamp); indexStatistics.updateTimestamp(val); seconds.write((val.getTime() / MILLIS_PER_SECOND) - base_timestamp); nanos.write(formatNanos(val.getNanos())); From b70cf6fc91aaac8eaeecd33d871c906585607cb2 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 19 Aug 2015 17:25:55 +0900 Subject: [PATCH 128/141] Added an unit test --- .../tajo/engine/query/TestSelectQuery.java | 18 ++++++++++++++++++ .../datetime_table_timezoned_orc_ddl.sql | 4 ++++ .../TestSelectQuery/testTimezonedORCTable.sql | 2 
++ 3 files changed, 24 insertions(+) create mode 100644 tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql create mode 100644 tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index 27c2fcd25f..f3ca9f6237 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -658,6 +658,24 @@ public void testTimezonedTable5() throws Exception { testingCluster.getConfiguration().setSystemTimezone(TimeZone.getTimeZone("GMT")); } } + + @Test + public void testTimeZonedORCTable() throws Exception { + try { + + executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned"); + executeDDL("datetime_table_timezoned_orc_ddl.sql", "timezoned_orc", "timezoned_orc"); + + executeString("INSERT OVERWRITE INTO timezoned_orc SELECT * FROM timezoned"); + + ResultSet res = executeQuery(); + assertResultSet(res, "testTimezonedTable3.result"); + cleanupQuery(res); + } finally { + executeString("DROP TABLE IF EXISTS timezoned"); + executeString("DROP TABLE IF EXISTS timezoned_orc"); + } + } @Test public void testMultiBytesDelimiter1() throws Exception { diff --git a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql new file mode 100644 index 0000000000..80cd63fea1 --- /dev/null +++ b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql @@ -0,0 +1,4 @@ +CREATE EXTERNAL TABLE ${0} ( + t_timestamp TIMESTAMP, + t_date DATE +) USING ORC WITH ('timezone' = 'GMT+9') LOCATION ${table.path} diff --git 
a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql new file mode 100644 index 0000000000..1d898bd73c --- /dev/null +++ b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/testTimezonedORCTable.sql @@ -0,0 +1,2 @@ +SET SESSION TIMEZONE = 'GMT+9'; +SELECT * FROM timezoned_orc; \ No newline at end of file From 93429e5e2e109918f2297cde259ae64270b7a5ca Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 20 Aug 2015 17:15:48 +0900 Subject: [PATCH 129/141] unit test fixed --- .../java/org/apache/tajo/engine/query/TestSelectQuery.java | 6 +++--- .../TestSelectQuery/datetime_table_timezoned_orc_ddl.sql | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index 16407c8b1c..e0ae099744 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -676,7 +676,7 @@ public void testLoadIntoTimezonedTable() throws Exception { try { executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned_load1"); executeDDL("datetime_table_timezoned_ddl2.sql", null, "timezoned_load2"); - executeString("insert overwrite into timezoned_load2 select * from timezoned_load1"); + executeString("INSERT OVERWRITE INTO timezoned_load2 SELECT * FROM timezoned_load1"); ResultSet res = executeQuery(); assertResultSet(res, "testTimezonedTable3.result"); @@ -693,7 +693,7 @@ public void testTimeZonedORCTable() throws Exception { try { executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned"); - executeDDL("datetime_table_timezoned_orc_ddl.sql", "timezoned_orc", "timezoned_orc"); + executeDDL("datetime_table_timezoned_orc_ddl.sql", null, "timezoned_orc"); 
executeString("INSERT OVERWRITE INTO timezoned_orc SELECT * FROM timezoned"); @@ -702,7 +702,7 @@ public void testTimeZonedORCTable() throws Exception { cleanupQuery(res); } finally { executeString("DROP TABLE IF EXISTS timezoned"); - executeString("DROP TABLE IF EXISTS timezoned_orc"); + executeString("DROP TABLE IF EXISTS timezoned_orc PURGE"); } } diff --git a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql index 80cd63fea1..49e1f7e8e6 100644 --- a/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql +++ b/tajo-core-tests/src/test/resources/queries/TestSelectQuery/datetime_table_timezoned_orc_ddl.sql @@ -1,4 +1,4 @@ -CREATE EXTERNAL TABLE ${0} ( +CREATE TABLE ${0} ( t_timestamp TIMESTAMP, t_date DATE -) USING ORC WITH ('timezone' = 'GMT+9') LOCATION ${table.path} +) USING ORC WITH ('timezone' = 'GMT+9') From 049cc856f1ecaeebe7c3312a4834401b36c3de93 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 20 Aug 2015 17:50:28 +0900 Subject: [PATCH 130/141] Method name modified --- .../test/java/org/apache/tajo/engine/query/TestSelectQuery.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index e0ae099744..f956c41cec 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -689,7 +689,7 @@ public void testLoadIntoTimezonedTable() throws Exception { } @Test - public void testTimeZonedORCTable() throws Exception { + public void testTimezonedORCTable() throws Exception { try { executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned"); From 
e3e8acacd197cbcdb118496e35a07cccd28963d4 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Thu, 20 Aug 2015 18:26:18 +0900 Subject: [PATCH 131/141] unit test fixed #2 --- .../test/java/org/apache/tajo/engine/query/TestSelectQuery.java | 1 + 1 file changed, 1 insertion(+) diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index f956c41cec..d7ec60fb7b 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -699,6 +699,7 @@ public void testTimezonedORCTable() throws Exception { ResultSet res = executeQuery(); assertResultSet(res, "testTimezonedTable3.result"); + executeString("SET TIME ZONE 'GMT'"); cleanupQuery(res); } finally { executeString("DROP TABLE IF EXISTS timezoned"); From 2a821d6cbedd0d3efcbba34750cce8a9c1c33498 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 1 Sep 2015 14:41:13 +0900 Subject: [PATCH 132/141] Error processing way is refined --- .../org/apache/tajo/datum/DatumFactory.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java index 824d57b504..8fa523b53a 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java @@ -18,11 +18,11 @@ package org.apache.tajo.datum; -import com.google.protobuf.Message; import org.apache.commons.codec.binary.Base64; import org.apache.tajo.common.TajoDataTypes.DataType; import org.apache.tajo.common.TajoDataTypes.Type; -import org.apache.tajo.exception.InvalidCastException; +import org.apache.tajo.error.Errors; +import org.apache.tajo.exception.TajoRuntimeException; import 
org.apache.tajo.util.NumberUtil; import org.apache.tajo.util.datetime.DateTimeFormat; import org.apache.tajo.util.datetime.DateTimeUtil; @@ -71,7 +71,7 @@ public static Class getDatumClass(Type type) { case NULL_TYPE: return NullDatum.class; default: - throw new UnsupportedOperationException(type.name()); + throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, type.name()); } } @@ -107,7 +107,7 @@ public static Datum createFromString(DataType dataType, String value) { case INET4: return createInet4(value); default: - throw new UnsupportedOperationException(dataType.toString()); + throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, dataType.toString()); } } @@ -147,10 +147,10 @@ public static Datum createFromBytes(DataType dataType, byte[] bytes) { return ProtobufDatumFactory.createDatum(dataType, bytes); } catch (IOException e) { e.printStackTrace(); - throw new RuntimeException(e); + throw new TajoRuntimeException(Errors.ResultCode.IO_ERROR, e.getMessage()); } default: - throw new UnsupportedOperationException(dataType.toString()); + throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, dataType.toString()); } } @@ -161,7 +161,7 @@ public static Datum createFromInt4(DataType type, int val) { case DATE: return new DateDatum(val); default: - throw new UnsupportedOperationException("Cannot create " + type.getType().name() + " datum from INT4"); + throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, type.getType().name()); } } @@ -174,7 +174,7 @@ public static Datum createFromInt8(DataType type, long val) { case TIME: return createTime(val); default: - throw new UnsupportedOperationException("Cannot create " + type.getType().name() + " datum from INT8"); + throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, type.getType().name()); } } @@ -330,7 +330,7 @@ public static DateDatum createDate(Datum datum) { case DATE: return (DateDatum) datum; default: - throw new 
InvalidCastException(datum.type(), Type.DATE); + throw new TajoRuntimeException(Errors.ResultCode.INVALID_CAST, datum.type().toString(), Type.DATE.name()); } } @@ -349,7 +349,7 @@ public static TimeDatum createTime(Datum datum, @Nullable TimeZone tz) { case TIME: return (TimeDatum) datum; default: - throw new InvalidCastException(datum.type(), Type.TIME); + throw new TajoRuntimeException(Errors.ResultCode.INVALID_CAST, datum.type().name(), Type.TIME.name()); } } @@ -362,7 +362,7 @@ public static TimestampDatum createTimestamp(Datum datum, @Nullable TimeZone tz) case TIMESTAMP: return (TimestampDatum) datum; default: - throw new InvalidCastException(datum.type(), Type.TIMESTAMP); + throw new TajoRuntimeException(Errors.ResultCode.INVALID_CAST, datum.type().name(), Type.TIMESTAMP.name()); } } @@ -459,7 +459,7 @@ public static Datum cast(Datum operandDatum, DataType target, @Nullable TimeZone case ANY: return DatumFactory.createAny(operandDatum); default: - throw new InvalidCastException(operandDatum.type(), target.getType()); + throw new TajoRuntimeException(Errors.ResultCode.INVALID_CAST, operandDatum.type().name(), target.getType().name()); } } } From f94fc583cbf2204438458d7918d073575cc61df1 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 1 Sep 2015 16:54:10 +0900 Subject: [PATCH 133/141] Added explanation for updating files modified in Tajo --- .../storage/thirdparty/orc/ColumnStatisticsImpl.java | 2 -- .../tajo/storage/thirdparty/orc/package-info.java | 12 +++++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java index 8742db1c8e..d74f9893b3 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java +++ 
b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/ColumnStatisticsImpl.java @@ -21,8 +21,6 @@ import org.apache.hadoop.hive.serde2.io.DateWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; -import org.apache.tajo.datum.BlobDatum; -import org.apache.tajo.datum.DateDatum; import org.apache.tajo.datum.Datum; import java.sql.Date; diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/package-info.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/package-info.java index 1c97124766..ae4841b478 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/package-info.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/thirdparty/orc/package-info.java @@ -20,7 +20,17 @@ * Provides read and write support for ORC files. * Source files in this packages are from Hive. * But, some files are modified for supporting the concept of Tajo Tuple and Datum. - * One of representative files is WriterImpl.java. Others are almost same as ones in Hive. + * Followings are listing of modified files. When updating library, be careful for + * the files. + * + * + * ColumnStatisticsImpl.java + * FileOrcDataSource.java + * HdfsOrcDataSource.java + * OrcConf.java + * OrcFile.java + * Writer.java + * WriterImpl.java *

*/ From 984e3a73d8dcd7ddaca9fbb177c9f645df60b3c8 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Tue, 8 Sep 2015 14:49:27 +0900 Subject: [PATCH 134/141] unit test fixed --- .../java/org/apache/tajo/engine/query/TestSelectQuery.java | 4 ++-- .../results/TestSelectQuery/testTimezonedORCTable.result | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) create mode 100644 tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result diff --git a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java index 649ecf3323..130b02d22d 100644 --- a/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java +++ b/tajo-core-tests/src/test/java/org/apache/tajo/engine/query/TestSelectQuery.java @@ -690,10 +690,10 @@ public void testTimezonedORCTable() throws Exception { executeDDL("datetime_table_timezoned_ddl.sql", "timezoned", "timezoned"); executeDDL("datetime_table_timezoned_orc_ddl.sql", null, "timezoned_orc"); - executeString("INSERT OVERWRITE INTO timezoned_orc SELECT * FROM timezoned"); + executeString("INSERT OVERWRITE INTO timezoned_orc SELECT t_timestamp, t_date FROM timezoned"); ResultSet res = executeQuery(); - assertResultSet(res, "testTimezonedTable3.result"); + assertResultSet(res, "testTimezonedORCTable.result"); executeString("SET TIME ZONE 'GMT'"); cleanupQuery(res); } finally { diff --git a/tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result b/tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result new file mode 100644 index 0000000000..39f593b0c8 --- /dev/null +++ b/tajo-core-tests/src/test/resources/results/TestSelectQuery/testTimezonedORCTable.result @@ -0,0 +1,5 @@ +t_timestamp,t_date +------------------------------- +1980-04-01 01:50:30.01,1980-04-01 +1980-04-01 01:50:30,1980-04-01 +1980-04-01 01:50:30,1980-04-01 \ No newline at 
end of file From 194a190940385c707d72763642a022606e33e63a Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Sun, 13 Sep 2015 01:46:10 +0900 Subject: [PATCH 135/141] revert empty table handling --- .../src/test/java/org/apache/tajo/QueryTestCaseBase.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java index 763840fd64..5a912e509e 100644 --- a/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java +++ b/tajo-cluster-tests/src/test/java/org/apache/tajo/QueryTestCaseBase.java @@ -913,8 +913,7 @@ private Path getDataSetFile(String fileName) throws IOException { throw new IOException("Cannot find " + fileName + " at " + currentQueryPath + " and " + namedQueryPath); } } else { - // to make empty table (used to insert test) - fs.mkdirs(dataFilePath); + throw new IOException("Cannot find " + fileName + " at " + currentQueryPath + " and " + namedQueryPath); } } return dataFilePath; From 4531315b10e2020c24ab950a511bb856678a6059 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 16 Sep 2015 12:46:23 +0900 Subject: [PATCH 136/141] Error handling code refined --- .../java/org/apache/tajo/datum/DateDatum.java | 5 +- .../java/org/apache/tajo/datum/Datum.java | 27 +++--- .../org/apache/tajo/datum/DatumFactory.java | 93 ++++++++++--------- .../org/apache/tajo/datum/Float4Datum.java | 5 +- .../java/org/apache/tajo/datum/Int8Datum.java | 5 +- .../java/org/apache/tajo/datum/NullDatum.java | 3 +- .../java/org/apache/tajo/datum/TextDatum.java | 5 +- .../java/org/apache/tajo/datum/TimeDatum.java | 7 +- .../tajo/exception/InvalidCastException.java | 11 ++- .../engine/codegen/TajoGeneratorAdapter.java | 16 ++-- 10 files changed, 95 insertions(+), 82 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java 
index 7cf0896354..3355e67600 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/DateDatum.java @@ -23,6 +23,7 @@ import org.apache.tajo.common.TajoDataTypes.Type; import org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.datetime.DateTimeConstants.DateStyle; import org.apache.tajo.util.datetime.DateTimeFormat; @@ -167,12 +168,12 @@ public long asInt8() { @Override public float asFloat4() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(Type.DATE, Type.FLOAT4)); } @Override public double asFloat8() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(Type.DATE, Type.FLOAT8)); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/Datum.java b/tajo-common/src/main/java/org/apache/tajo/datum/Datum.java index e3e12956e8..ec4992bb83 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/Datum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/Datum.java @@ -24,6 +24,7 @@ import org.apache.tajo.conf.TajoConf.ConfVars; import org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.json.CommonGsonHelper; import org.apache.tajo.json.GsonObject; import org.apache.tajo.util.datetime.TimeMeta; @@ -66,46 +67,48 @@ public boolean isNotNull() { } public boolean asBool() { - throw new InvalidCastException(type, Type.BOOLEAN); + throw new TajoRuntimeException(new InvalidCastException(type, Type.BOOLEAN)); } public byte asByte() { - throw new InvalidCastException(type, Type.BIT); + throw new TajoRuntimeException(new InvalidCastException(type, Type.BIT)); } public char asChar() { - 
throw new InvalidCastException(type, Type.CHAR); + throw new TajoRuntimeException(new InvalidCastException(type, Type.CHAR)); } public short asInt2() { - throw new InvalidCastException(type, Type.INT2); + throw new TajoRuntimeException(new InvalidCastException(type, Type.INT2)); } + public int asInt4() { - throw new InvalidCastException(type, Type.INT4); + throw new TajoRuntimeException(new InvalidCastException(type, Type.INT4)); } + public long asInt8() { - throw new InvalidCastException(type, Type.INT8); + throw new TajoRuntimeException(new InvalidCastException(type, Type.INT8)); } public byte [] asByteArray() { - throw new InvalidCastException(type, Type.BLOB); + throw new TajoRuntimeException(new InvalidCastException(type, Type.BLOB)); } public float asFloat4() { - throw new InvalidCastException(type, Type.FLOAT4); + throw new TajoRuntimeException(new InvalidCastException(type, Type.FLOAT4)); } public double asFloat8() { - throw new InvalidCastException(type, Type.FLOAT8); + throw new TajoRuntimeException(new InvalidCastException(type, Type.FLOAT8)); } public String asChars() { - throw new InvalidCastException(type, Type.TEXT); + throw new TajoRuntimeException(new InvalidCastException(type, Type.TEXT)); } // todo remove this public char [] asUnicodeChars() { - throw new InvalidCastException(type, Type.TEXT); + throw new TajoRuntimeException(new InvalidCastException(type, Type.TEXT)); } public byte[] asTextBytes() { @@ -113,7 +116,7 @@ public byte[] asTextBytes() { } public TimeMeta asTimeMeta() { - throw new InvalidCastException(type, Type.INT8); + throw new TajoRuntimeException(new InvalidCastException(type, Type.INT8)); } public boolean isNumeric() { diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java index 8fa523b53a..de873867d7 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java +++ 
b/tajo-common/src/main/java/org/apache/tajo/datum/DatumFactory.java @@ -21,8 +21,10 @@ import org.apache.commons.codec.binary.Base64; import org.apache.tajo.common.TajoDataTypes.DataType; import org.apache.tajo.common.TajoDataTypes.Type; -import org.apache.tajo.error.Errors; +import org.apache.tajo.exception.InvalidCastException; +import org.apache.tajo.exception.TajoInternalError; import org.apache.tajo.exception.TajoRuntimeException; +import org.apache.tajo.exception.UnsupportedDataTypeException; import org.apache.tajo.util.NumberUtil; import org.apache.tajo.util.datetime.DateTimeFormat; import org.apache.tajo.util.datetime.DateTimeUtil; @@ -71,7 +73,7 @@ public static Class getDatumClass(Type type) { case NULL_TYPE: return NullDatum.class; default: - throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, type.name()); + throw new TajoRuntimeException(new UnsupportedDataTypeException(type.name())); } } @@ -107,50 +109,49 @@ public static Datum createFromString(DataType dataType, String value) { case INET4: return createInet4(value); default: - throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, dataType.toString()); + throw new TajoRuntimeException(new UnsupportedDataTypeException(dataType.toString())); } } public static Datum createFromBytes(DataType dataType, byte[] bytes) { switch (dataType.getType()) { - case BOOLEAN: - return createBool(bytes[0]); - case INT2: - return createInt2(NumberUtil.toShort(bytes)); - case INT4: - return createInt4(NumberUtil.toInt(bytes)); - case INT8: - return createInt8(NumberUtil.toLong(bytes)); - case FLOAT4: - return createFloat4(NumberUtil.toFloat(bytes)); - case FLOAT8: - return createFloat8(NumberUtil.toDouble(bytes)); - case CHAR: - return createChar(bytes); - case TEXT: - return createText(bytes); - case DATE: - return new DateDatum(NumberUtil.toInt(bytes)); - case TIME: - return new TimeDatum(NumberUtil.toLong(bytes)); - case TIMESTAMP: - return new 
TimestampDatum(NumberUtil.toLong(bytes)); - case BIT: - return createBit(bytes[0]); - case BLOB: - return createBlob(bytes); - case INET4: - return createInet4(bytes); - case PROTOBUF: - try { - return ProtobufDatumFactory.createDatum(dataType, bytes); - } catch (IOException e) { - e.printStackTrace(); - throw new TajoRuntimeException(Errors.ResultCode.IO_ERROR, e.getMessage()); - } - default: - throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, dataType.toString()); + case BOOLEAN: + return createBool(bytes[0]); + case INT2: + return createInt2(NumberUtil.toShort(bytes)); + case INT4: + return createInt4(NumberUtil.toInt(bytes)); + case INT8: + return createInt8(NumberUtil.toLong(bytes)); + case FLOAT4: + return createFloat4(NumberUtil.toFloat(bytes)); + case FLOAT8: + return createFloat8(NumberUtil.toDouble(bytes)); + case CHAR: + return createChar(bytes); + case TEXT: + return createText(bytes); + case DATE: + return new DateDatum(NumberUtil.toInt(bytes)); + case TIME: + return new TimeDatum(NumberUtil.toLong(bytes)); + case TIMESTAMP: + return new TimestampDatum(NumberUtil.toLong(bytes)); + case BIT: + return createBit(bytes[0]); + case BLOB: + return createBlob(bytes); + case INET4: + return createInet4(bytes); + case PROTOBUF: + try { + return ProtobufDatumFactory.createDatum(dataType, bytes); + } catch (IOException e) { + throw new TajoInternalError(e); + } + default: + throw new TajoRuntimeException(new UnsupportedDataTypeException(dataType.toString())); } } @@ -161,7 +162,7 @@ public static Datum createFromInt4(DataType type, int val) { case DATE: return new DateDatum(val); default: - throw new TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, type.getType().name()); + throw new TajoRuntimeException(new UnsupportedDataTypeException(type.getType().name())); } } @@ -174,7 +175,7 @@ public static Datum createFromInt8(DataType type, long val) { case TIME: return createTime(val); default: - throw new 
TajoRuntimeException(Errors.ResultCode.UNSUPPORTED_DATATYPE, type.getType().name()); + throw new TajoRuntimeException(new UnsupportedDataTypeException(type.getType().name())); } } @@ -330,7 +331,7 @@ public static DateDatum createDate(Datum datum) { case DATE: return (DateDatum) datum; default: - throw new TajoRuntimeException(Errors.ResultCode.INVALID_CAST, datum.type().toString(), Type.DATE.name()); + throw new TajoRuntimeException(new InvalidCastException(datum.type(), Type.DATE)); } } @@ -349,7 +350,7 @@ public static TimeDatum createTime(Datum datum, @Nullable TimeZone tz) { case TIME: return (TimeDatum) datum; default: - throw new TajoRuntimeException(Errors.ResultCode.INVALID_CAST, datum.type().name(), Type.TIME.name()); + throw new TajoRuntimeException(new InvalidCastException(datum.type(), Type.TIME)); } } @@ -362,7 +363,7 @@ public static TimestampDatum createTimestamp(Datum datum, @Nullable TimeZone tz) case TIMESTAMP: return (TimestampDatum) datum; default: - throw new TajoRuntimeException(Errors.ResultCode.INVALID_CAST, datum.type().name(), Type.TIMESTAMP.name()); + throw new TajoRuntimeException(new InvalidCastException(datum.type(), Type.TIMESTAMP)); } } @@ -459,7 +460,7 @@ public static Datum cast(Datum operandDatum, DataType target, @Nullable TimeZone case ANY: return DatumFactory.createAny(operandDatum); default: - throw new TajoRuntimeException(Errors.ResultCode.INVALID_CAST, operandDatum.type().name(), target.getType().name()); + throw new TajoRuntimeException(new InvalidCastException(operandDatum.type(), target.getType())); } } } diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/Float4Datum.java b/tajo-common/src/main/java/org/apache/tajo/datum/Float4Datum.java index 0fe598a2e6..04a735f9c9 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/Float4Datum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/Float4Datum.java @@ -22,6 +22,7 @@ import org.apache.tajo.common.TajoDataTypes; import 
org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.util.MurmurHash; import org.apache.tajo.util.NumberUtil; import org.apache.tajo.util.datetime.DateTimeUtil; @@ -45,7 +46,7 @@ public Float4Datum(byte[] bytes) { } public boolean asBool() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.FLOAT4, TajoDataTypes.Type.BOOLEAN)); } @Override @@ -70,7 +71,7 @@ public long asInt8() { @Override public byte asByte() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.FLOAT4, TajoDataTypes.Type.BIT)); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/Int8Datum.java b/tajo-common/src/main/java/org/apache/tajo/datum/Int8Datum.java index 66c093ab05..55a65d91c2 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/Int8Datum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/Int8Datum.java @@ -22,6 +22,7 @@ import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.util.MurmurHash; import org.apache.tajo.util.NumberUtil; import org.apache.tajo.util.datetime.DateTimeUtil; @@ -47,7 +48,7 @@ public Int8Datum(byte[] bytes) { @Override public boolean asBool() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.INT8, TajoDataTypes.Type.BOOLEAN)); } @Override @@ -72,7 +73,7 @@ public long asInt8() { @Override public byte asByte() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.INT8, TajoDataTypes.Type.BIT)); } @Override diff --git 
a/tajo-common/src/main/java/org/apache/tajo/datum/NullDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/NullDatum.java index 0007b52ea2..84dc998e87 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/NullDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/NullDatum.java @@ -20,6 +20,7 @@ import org.apache.tajo.common.TajoDataTypes.DataType; import org.apache.tajo.exception.InvalidCastException; +import org.apache.tajo.exception.TajoRuntimeException; import static org.apache.tajo.common.TajoDataTypes.Type; @@ -58,7 +59,7 @@ public boolean isNotNull() { @Override public boolean asBool() { - throw new InvalidCastException(Type.NULL_TYPE, Type.BOOLEAN); + throw new TajoRuntimeException(new InvalidCastException(Type.NULL_TYPE, Type.BOOLEAN)); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java index ffd6ca2538..2102080d9e 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TextDatum.java @@ -24,6 +24,7 @@ import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.util.MurmurHash; import org.apache.tajo.util.StringUtils; @@ -51,12 +52,12 @@ public TextDatum(String string) { @Override public boolean asBool() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.TEXT, TajoDataTypes.Type.BOOLEAN)); } @Override public byte asByte() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.TEXT, TajoDataTypes.Type.BIT)); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java b/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java 
index 578665737b..62e5db12e6 100644 --- a/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java +++ b/tajo-common/src/main/java/org/apache/tajo/datum/TimeDatum.java @@ -22,6 +22,7 @@ import org.apache.tajo.common.TajoDataTypes; import org.apache.tajo.exception.InvalidCastException; import org.apache.tajo.exception.InvalidOperationException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.util.Bytes; import org.apache.tajo.util.datetime.DateTimeConstants.DateStyle; import org.apache.tajo.util.datetime.DateTimeUtil; @@ -73,7 +74,7 @@ public String toString() { @Override public int asInt4() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.TIME, TajoDataTypes.Type.INT4)); } @Override @@ -83,12 +84,12 @@ public long asInt8() { @Override public float asFloat4() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.TIME, TajoDataTypes.Type.FLOAT4)); } @Override public double asFloat8() { - throw new InvalidCastException(); + throw new TajoRuntimeException(new InvalidCastException(TajoDataTypes.Type.TIME, TajoDataTypes.Type.FLOAT8)); } @Override diff --git a/tajo-common/src/main/java/org/apache/tajo/exception/InvalidCastException.java b/tajo-common/src/main/java/org/apache/tajo/exception/InvalidCastException.java index f5629240da..c49f91370c 100644 --- a/tajo-common/src/main/java/org/apache/tajo/exception/InvalidCastException.java +++ b/tajo-common/src/main/java/org/apache/tajo/exception/InvalidCastException.java @@ -19,18 +19,21 @@ package org.apache.tajo.exception; import org.apache.tajo.common.TajoDataTypes; +import org.apache.tajo.error.Errors; +import org.apache.tajo.rpc.protocolrecords.PrimitiveProtos.ReturnState; -public class InvalidCastException extends RuntimeException { +public class InvalidCastException extends TajoException { private static final long serialVersionUID = -7689027447969916148L; - 
public InvalidCastException() { + public InvalidCastException(ReturnState state) { + super(state); } public InvalidCastException(TajoDataTypes.DataType src, TajoDataTypes.DataType target) { - super(src.getType().name() + " value cannot be casted to " + target.getType().name()); + super(Errors.ResultCode.INVALID_CAST, src.getType().name(), target.getType().name()); } public InvalidCastException(TajoDataTypes.Type src, TajoDataTypes.Type target) { - super(src.name() + " value cannot be casted to " + target.name()); + super(Errors.ResultCode.INVALID_CAST, src.name(), target.name()); } } diff --git a/tajo-core/src/main/java/org/apache/tajo/engine/codegen/TajoGeneratorAdapter.java b/tajo-core/src/main/java/org/apache/tajo/engine/codegen/TajoGeneratorAdapter.java index 2fdafa0710..5c2c328b39 100644 --- a/tajo-core/src/main/java/org/apache/tajo/engine/codegen/TajoGeneratorAdapter.java +++ b/tajo-core/src/main/java/org/apache/tajo/engine/codegen/TajoGeneratorAdapter.java @@ -471,7 +471,7 @@ public void castInsn(TajoDataTypes.DataType srcType, TajoDataTypes.DataType targ case FLOAT8: methodvisitor.visitInsn(Opcodes.I2D); break; case TEXT: emitStringValueOfChar(); break; default: - throw new InvalidCastException(srcType, targetType); + throw new TajoRuntimeException(new InvalidCastException(srcType, targetType)); } } else { switch (targetRawType) { @@ -483,7 +483,7 @@ public void castInsn(TajoDataTypes.DataType srcType, TajoDataTypes.DataType targ case FLOAT4: emitParseFloat4(); break; case FLOAT8: emitParseFloat8(); break; case TEXT: break; - default: throw new InvalidCastException(srcType, targetType); + default: throw new TajoRuntimeException(new InvalidCastException(srcType, targetType)); } } break; @@ -500,7 +500,7 @@ public void castInsn(TajoDataTypes.DataType srcType, TajoDataTypes.DataType targ case FLOAT4: methodvisitor.visitInsn(Opcodes.I2F); break; case FLOAT8: methodvisitor.visitInsn(Opcodes.I2D); break; case TEXT: emitStringValueOfInt4(); break; - default: throw 
new InvalidCastException(srcType, targetType); + default: throw new TajoRuntimeException(new InvalidCastException(srcType, targetType)); } break; case INT8: @@ -513,7 +513,7 @@ public void castInsn(TajoDataTypes.DataType srcType, TajoDataTypes.DataType targ case FLOAT4: methodvisitor.visitInsn(Opcodes.L2F); break; case FLOAT8: methodvisitor.visitInsn(Opcodes.L2D); break; case TEXT: emitStringValueOfInt8(); break; - default: throw new InvalidCastException(srcType, targetType); + default: throw new TajoRuntimeException(new InvalidCastException(srcType, targetType)); } break; case FLOAT4: @@ -526,7 +526,7 @@ public void castInsn(TajoDataTypes.DataType srcType, TajoDataTypes.DataType targ case FLOAT4: return; case FLOAT8: methodvisitor.visitInsn(Opcodes.F2D); break; case TEXT: emitStringValueOfFloat4(); break; - default: throw new InvalidCastException(srcType, targetType); + default: throw new TajoRuntimeException(new InvalidCastException(srcType, targetType)); } break; case FLOAT8: @@ -539,7 +539,7 @@ public void castInsn(TajoDataTypes.DataType srcType, TajoDataTypes.DataType targ case FLOAT4: methodvisitor.visitInsn(Opcodes.D2F); break; case FLOAT8: return; case TEXT: emitStringValueOfFloat8(); break; - default: throw new InvalidCastException(srcType, targetType); + default: throw new TajoRuntimeException(new InvalidCastException(srcType, targetType)); } break; case TEXT: @@ -567,10 +567,10 @@ public void castInsn(TajoDataTypes.DataType srcType, TajoDataTypes.DataType targ "toJulianTime", "(L" + Type.getInternalName(String.class) + ";)J"); break; } - default: throw new InvalidCastException(srcType, targetType); + default: throw new TajoRuntimeException(new InvalidCastException(srcType, targetType)); } break; - default: throw new InvalidCastException(srcType, targetType); + default: throw new TajoRuntimeException(new InvalidCastException(srcType, targetType)); } } From 9d148285454293cc967e27fe9859f1993064fa75 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 16 
Sep 2015 12:55:13 +0900 Subject: [PATCH 137/141] delete TestOrc --- .../org/apache/tajo/storage/orc/TestOrc.java | 176 ------------------ 1 file changed, 176 deletions(-) delete mode 100644 tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java deleted file mode 100644 index 5a039d0b81..0000000000 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/orc/TestOrc.java +++ /dev/null @@ -1,176 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.tajo.storage.orc; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hive.serde2.objectinspector.StructField; -import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; -import org.apache.tajo.catalog.Schema; -import org.apache.tajo.catalog.TableMeta; -import org.apache.tajo.common.TajoDataTypes; -import org.apache.tajo.conf.TajoConf; -import org.apache.tajo.datum.Int2Datum; -import org.apache.tajo.datum.Int4Datum; -import org.apache.tajo.datum.TextDatum; -import org.apache.tajo.datum.TimestampDatum; -import org.apache.tajo.storage.Tuple; -import org.apache.tajo.storage.VTuple; -import org.apache.tajo.storage.fragment.FileFragment; -import org.apache.tajo.storage.fragment.Fragment; -import org.apache.tajo.storage.orc.objectinspector.ObjectInspectorFactory; -import org.apache.tajo.util.KeyValueSet; -import org.apache.tajo.util.datetime.DateTimeUtil; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - -import static org.junit.Assert.*; - -import java.io.IOException; -import java.net.URL; -import java.util.List; - -public class TestOrc { - private ORCScanner orcScanner; - - private static Configuration conf = new TajoConf(); - private static FileSystem fs; - - static { - try { - fs = FileSystem.getLocal(conf); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public static Path getResourcePath(String path, String suffix) { - URL resultBaseURL = ClassLoader.getSystemResource(path); - return new Path(resultBaseURL.toString(), suffix); - } - - private static FileFragment getFileFragment(String fileName) throws IOException { - Path tablePath = new Path(getResourcePath("dataset", "."), fileName); - FileStatus status = fs.getFileStatus(tablePath); - return new FileFragment("table", tablePath, 0, status.getLen()); - } - - @Before - public void setup() 
throws IOException { - Schema schema = new Schema(); - schema.addColumn("userid", TajoDataTypes.Type.INT4); - schema.addColumn("movieid", TajoDataTypes.Type.INT4); - schema.addColumn("rating", TajoDataTypes.Type.INT2); - schema.addColumn("unixtimestamp", TajoDataTypes.Type.TEXT); - schema.addColumn("faketime", TajoDataTypes.Type.TIMESTAMP); - - TableMeta meta = new TableMeta("ORC", new KeyValueSet()); - - Fragment fragment = getFileFragment("u_data_20.orc"); - - orcScanner = new ORCScanner(conf, schema, meta, fragment); - - orcScanner.init(); - } - - @Test - public void testReadTuple() { - try { - Tuple tuple = orcScanner.next(); - - assertEquals(tuple.getInt4(0), 196); - assertEquals(tuple.getInt4(1), 242); - assertEquals(tuple.getInt2(2), 3); - assertEquals(tuple.getText(3), "881250949"); - - // Timestamp test - TimestampDatum timestamp = (TimestampDatum)tuple.asDatum(4); - - assertEquals(timestamp.getYear(), 2008); - assertEquals(timestamp.getMonthOfYear(), 12); - assertEquals(timestamp.getDayOfMonth(), 12); - } catch (IOException e) { - e.printStackTrace(); - } - } - - @Test - public void testWrite() { - Schema schema = new Schema(); - schema.addColumn("movieid", TajoDataTypes.Type.INT4); - schema.addColumn("rating", TajoDataTypes.Type.INT2); - schema.addColumn("comment", TajoDataTypes.Type.TEXT); - schema.addColumn("showtime", TajoDataTypes.Type.TIMESTAMP); - - StructObjectInspector structOI = ObjectInspectorFactory.buildStructObjectInspector(schema); - List fieldList = structOI.getAllStructFieldRefs(); - StructField midField = fieldList.get(0); - - assertEquals("movieid", midField.getFieldName()); - - Path writePath = new Path(getResourcePath("dataset", "."), "temp_test.orc"); - - try { - if (fs.exists(writePath)) { - fs.delete(writePath); - } - - TableMeta meta = new TableMeta("ORC", new KeyValueSet()); - - ORCAppender appender = new ORCAppender(conf, null, schema, meta, writePath); - - appender.init(); - - Tuple tuple = new VTuple(schema.size()); - 
tuple.put(0, new Int4Datum(100)); - tuple.put(1, new Int2Datum((short)7)); - tuple.put(2, new TextDatum("good")); - tuple.put(3, new TimestampDatum(DateTimeUtil.javaTimeToJulianTime(System.currentTimeMillis()))); - - appender.addTuple(tuple); - - appender.close(); - - Fragment fragment = getFileFragment("temp_test.orc"); - ORCScanner orcScanner = new ORCScanner(conf, schema, meta, fragment); - orcScanner.init(); - - tuple = orcScanner.next(); - - assertEquals(100, tuple.getInt4(0)); - - orcScanner.close(); - - } catch (IOException e) { - e.printStackTrace(); - } - } - - @After - public void end() { - try { - orcScanner.close(); - } catch (IOException e) { - e.printStackTrace(); - } - } -} \ No newline at end of file From 404c46fe6f53020dccc85ee4f310976c26dc7db4 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 16 Sep 2015 13:57:28 +0900 Subject: [PATCH 138/141] Test code changed by exception refining --- .../java/org/apache/tajo/exception/ErrorMessages.java | 1 + .../test/java/org/apache/tajo/datum/TestDateDatum.java | 6 +++--- .../test/java/org/apache/tajo/datum/TestTimeDatum.java | 8 ++++---- .../java/org/apache/tajo/datum/TestTimestampDatum.java | 8 ++++---- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tajo-common/src/main/java/org/apache/tajo/exception/ErrorMessages.java b/tajo-common/src/main/java/org/apache/tajo/exception/ErrorMessages.java index a3f18e3880..72c01a1e59 100644 --- a/tajo-common/src/main/java/org/apache/tajo/exception/ErrorMessages.java +++ b/tajo-common/src/main/java/org/apache/tajo/exception/ErrorMessages.java @@ -39,6 +39,7 @@ public class ErrorMessages { ADD_MESSAGE(NOT_IMPLEMENTED, "not implemented feature: %s", 1); ADD_MESSAGE(FEATURE_NOT_SUPPORTED, "unsupported feature: %s", 1); ADD_MESSAGE(INVALID_RPC_CALL, "invalid RPC Call: %s", 1); + ADD_MESSAGE(INVALID_CAST, "%s value cannot be casted to %s", 2); // Query Management and Scheduler ADD_MESSAGE(QUERY_FAILED, "query has been failed due to %s", 1); diff --git 
a/tajo-common/src/test/java/org/apache/tajo/datum/TestDateDatum.java b/tajo-common/src/test/java/org/apache/tajo/datum/TestDateDatum.java index 41b4dcae47..2b787f5a72 100644 --- a/tajo-common/src/test/java/org/apache/tajo/datum/TestDateDatum.java +++ b/tajo-common/src/test/java/org/apache/tajo/datum/TestDateDatum.java @@ -19,7 +19,7 @@ package org.apache.tajo.datum; import org.apache.tajo.common.TajoDataTypes.Type; -import org.apache.tajo.exception.InvalidCastException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.json.CommonGsonHelper; import org.junit.Test; @@ -49,13 +49,13 @@ public final void testAsInt8() { assertEquals(d, copy); } - @Test(expected = InvalidCastException.class) + @Test(expected = TajoRuntimeException.class) public final void testAsFloat4() { Datum d = DatumFactory.createDate(DATE); d.asFloat4(); } - @Test(expected = InvalidCastException.class) + @Test(expected = TajoRuntimeException.class) public final void testAsFloat8() { Datum d = DatumFactory.createDate(DATE); d.asFloat8(); diff --git a/tajo-common/src/test/java/org/apache/tajo/datum/TestTimeDatum.java b/tajo-common/src/test/java/org/apache/tajo/datum/TestTimeDatum.java index ea641ecf7b..457ff41e01 100644 --- a/tajo-common/src/test/java/org/apache/tajo/datum/TestTimeDatum.java +++ b/tajo-common/src/test/java/org/apache/tajo/datum/TestTimeDatum.java @@ -20,7 +20,7 @@ import org.apache.tajo.common.TajoDataTypes.Type; import org.apache.tajo.common.TajoDataTypes.DataType; -import org.apache.tajo.exception.InvalidCastException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.json.CommonGsonHelper; import org.junit.Test; @@ -36,7 +36,7 @@ public final void testType() { assertEquals(Type.TIME, d.type()); } - @Test(expected = InvalidCastException.class) + @Test(expected = TajoRuntimeException.class) public final void testAsInt4() { Datum d = DatumFactory.createTime(TIME); Datum copy = DatumFactory.createTime(d.asInt4()); @@ -50,13 
+50,13 @@ public final void testAsInt8() { assertEquals(d, copy); } - @Test(expected = InvalidCastException.class) + @Test(expected = TajoRuntimeException.class) public final void testAsFloat4() { Datum d = DatumFactory.createTime(TIME); d.asFloat4(); } - @Test(expected = InvalidCastException.class) + @Test(expected = TajoRuntimeException.class) public final void testAsFloat8() { Datum d = DatumFactory.createTime(TIME); d.asFloat8(); diff --git a/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java b/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java index 7cb31231c3..dc8a8819bf 100644 --- a/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java +++ b/tajo-common/src/test/java/org/apache/tajo/datum/TestTimestampDatum.java @@ -19,7 +19,7 @@ package org.apache.tajo.datum; import org.apache.tajo.common.TajoDataTypes.Type; -import org.apache.tajo.exception.InvalidCastException; +import org.apache.tajo.exception.TajoRuntimeException; import org.apache.tajo.json.CommonGsonHelper; import org.apache.tajo.util.datetime.DateTimeUtil; import org.junit.BeforeClass; @@ -50,7 +50,7 @@ public final void testType() { assertEquals(Type.TIMESTAMP, d.type()); } - @Test(expected = InvalidCastException.class) + @Test(expected = TajoRuntimeException.class) public final void testAsInt4() { Datum d = DatumFactory.createTimestmpDatumWithUnixTime(unixtime); d.asInt4(); @@ -63,13 +63,13 @@ public final void testAsInt8() { assertEquals(DateTimeUtil.javaTimeToJulianTime(javaTime), d.asInt8()); } - @Test(expected = InvalidCastException.class) + @Test(expected = TajoRuntimeException.class) public final void testAsFloat4() { Datum d = DatumFactory.createTimestmpDatumWithUnixTime(unixtime); d.asFloat4(); } - @Test(expected = InvalidCastException.class) + @Test(expected = TajoRuntimeException.class) public final void testAsFloat8() { int instance = 1386577582; Datum d = DatumFactory.createTimestmpDatumWithUnixTime(instance); From 
39f6fc16a38fbb606db5767902c29dcd56f32ddb Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 16 Sep 2015 14:46:47 +0900 Subject: [PATCH 139/141] Error handling added --- .../java/org/apache/tajo/storage/orc/ORCScanner.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java index b197d59eab..d157f7aea2 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java @@ -172,6 +172,11 @@ public void init() throws IOException { @Override public Tuple next() throws IOException { + // EOF + if (batchSize == -1) { + return null; + } + if (currentPosInBatch == batchSize) { getNextBatch(); @@ -296,6 +301,10 @@ private Datum createValueDatum(Vector vector, TajoDataTypes.DataType type) { private void getNextBatch() throws IOException { batchSize = recordReader.nextBatch(); + // end of file + if (batchSize == -1) + return; + for (int i=0; i Date: Wed, 16 Sep 2015 15:46:18 +0900 Subject: [PATCH 140/141] orc scanner init logic changed --- .../main/java/org/apache/tajo/storage/orc/ORCScanner.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java index d157f7aea2..8a9d623e88 100644 --- a/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java +++ b/tajo-storage/tajo-storage-hdfs/src/main/java/org/apache/tajo/storage/orc/ORCScanner.java @@ -166,17 +166,10 @@ public void init() throws IOException { LOG.debug("file fragment { path: " + fragment.getPath() + ", start offset: " + fragment.getStartKey() + ", length: " + fragment.getLength() + "}"); - - 
getNextBatch(); } @Override public Tuple next() throws IOException { - // EOF - if (batchSize == -1) { - return null; - } - if (currentPosInBatch == batchSize) { getNextBatch(); From e201b74c67144bf1226349782b842640112bc5e3 Mon Sep 17 00:00:00 2001 From: Jongyoung Park Date: Wed, 16 Sep 2015 16:58:36 +0900 Subject: [PATCH 141/141] path to local variable --- .../src/test/java/org/apache/tajo/storage/TestStorages.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java index a401c51b51..02472eb0ce 100644 --- a/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java +++ b/tajo-storage/tajo-storage-hdfs/src/test/java/org/apache/tajo/storage/TestStorages.java @@ -61,7 +61,6 @@ @RunWith(Parameterized.class) public class TestStorages { private TajoConf conf; - private static String TEST_PATH = "target/test-data/TestStorages"; private static String TEST_PROJECTION_AVRO_SCHEMA = "{\n" + @@ -120,6 +119,8 @@ public class TestStorages { public TestStorages(String type, boolean splitable, boolean statsable, boolean seekable, boolean internalType) throws IOException { + final String TEST_PATH = "target/test-data/TestStorages"; + this.storeType = type; this.splitable = splitable; this.statsable = statsable;