Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[GOBBLIN-573] Add option to use finer level granularity at the hour level for TimeAwareDatasetfinder
Closes #2438 from sv2000/hourly
- Loading branch information
Showing 3 changed files with 251 additions and 34 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
189 changes: 189 additions & 0 deletions
189
...t/java/org/apache/gobblin/data/management/copy/TimeAwareRecursiveCopyableDatasetTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.gobblin.data.management.copy; | ||
|
||
import java.io.IOException; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Properties; | ||
import java.util.Set; | ||
|
||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.fs.FileStatus; | ||
import org.apache.hadoop.fs.FileSystem; | ||
import org.apache.hadoop.fs.Path; | ||
import org.apache.hadoop.fs.PathFilter; | ||
import org.joda.time.LocalDateTime; | ||
import org.joda.time.Period; | ||
import org.joda.time.format.DateTimeFormat; | ||
import org.joda.time.format.DateTimeFormatter; | ||
import org.joda.time.format.PeriodFormatter; | ||
import org.joda.time.format.PeriodFormatterBuilder; | ||
import org.testng.Assert; | ||
import org.testng.annotations.AfterClass; | ||
import org.testng.annotations.BeforeClass; | ||
import org.testng.annotations.Test; | ||
|
||
import org.apache.gobblin.util.PathUtils; | ||
import org.apache.gobblin.util.filters.HiddenFilter; | ||
|
||
public class TimeAwareRecursiveCopyableDatasetTest { | ||
private FileSystem fs; | ||
private Path baseDir1; | ||
private Path baseDir2; | ||
|
||
private static final String NUM_LOOKBACK_DAYS_STR = "2d"; | ||
private static final Integer NUM_LOOKBACK_DAYS = 2; | ||
private static final String NUM_LOOKBACK_HOURS_STR = "4h"; | ||
private static final Integer NUM_LOOKBACK_HOURS = 4; | ||
private static final Integer MAX_NUM_DAILY_DIRS = 4; | ||
private static final Integer MAX_NUM_HOURLY_DIRS = 48; | ||
private static final String NUM_LOOKBACK_DAYS_HOURS_STR = "1d1h"; | ||
private static final Integer NUM_DAYS_HOURS_DIRS = 25; | ||
|
||
@BeforeClass | ||
public void setUp() throws IOException { | ||
Assert.assertTrue(NUM_LOOKBACK_DAYS < MAX_NUM_DAILY_DIRS); | ||
Assert.assertTrue(NUM_LOOKBACK_HOURS < MAX_NUM_HOURLY_DIRS); | ||
|
||
this.fs = FileSystem.getLocal(new Configuration()); | ||
|
||
baseDir1 = new Path("/tmp/src/ds1/hourly"); | ||
if (fs.exists(baseDir1)) { | ||
fs.delete(baseDir1, true); | ||
} | ||
fs.mkdirs(baseDir1); | ||
|
||
baseDir2 = new Path("/tmp/src/ds1/daily"); | ||
if (fs.exists(baseDir2)) { | ||
fs.delete(baseDir2, true); | ||
} | ||
fs.mkdirs(baseDir2); | ||
PeriodFormatter formatter = new PeriodFormatterBuilder().appendDays().appendSuffix("d").appendHours().appendSuffix("h").toFormatter(); | ||
Period period = formatter.parsePeriod(NUM_LOOKBACK_DAYS_HOURS_STR); | ||
} | ||
|
||
@Test | ||
public void testGetFilesAtPath() throws IOException { | ||
String datePattern = "yyyy/MM/dd/HH"; | ||
DateTimeFormatter formatter = DateTimeFormat.forPattern(datePattern); | ||
|
||
LocalDateTime endDate = LocalDateTime.now(); | ||
|
||
Set<String> candidateFiles = new HashSet<>(); | ||
for (int i = 0; i < MAX_NUM_HOURLY_DIRS; i++) { | ||
String startDate = endDate.minusHours(i).toString(formatter); | ||
Path subDirPath = new Path(baseDir1, new Path(startDate)); | ||
fs.mkdirs(subDirPath); | ||
Path filePath = new Path(subDirPath, i + ".avro"); | ||
fs.create(filePath); | ||
if (i < (NUM_LOOKBACK_HOURS + 1)) { | ||
candidateFiles.add(filePath.toString()); | ||
} | ||
} | ||
|
||
//Lookback time = "4h" | ||
Properties properties = new Properties(); | ||
properties.setProperty(TimeAwareRecursiveCopyableDataset.LOOKBACK_TIME_KEY, NUM_LOOKBACK_HOURS_STR); | ||
properties.setProperty(TimeAwareRecursiveCopyableDataset.DATE_PATTERN_KEY, "yyyy/MM/dd/HH"); | ||
|
||
PathFilter pathFilter = new HiddenFilter(); | ||
TimeAwareRecursiveCopyableDataset dataset = new TimeAwareRecursiveCopyableDataset(fs, baseDir1, properties, | ||
new Path("/tmp/src/*/hourly")); | ||
List<FileStatus> fileStatusList = dataset.getFilesAtPath(fs, baseDir1, pathFilter); | ||
|
||
Assert.assertEquals(fileStatusList.size(), NUM_LOOKBACK_HOURS + 1); | ||
|
||
for (FileStatus fileStatus: fileStatusList) { | ||
Assert.assertTrue(candidateFiles.contains(PathUtils.getPathWithoutSchemeAndAuthority(fileStatus.getPath()).toString())); | ||
} | ||
|
||
//Lookback time = "1d1h" | ||
properties = new Properties(); | ||
properties.setProperty(TimeAwareRecursiveCopyableDataset.LOOKBACK_TIME_KEY, NUM_LOOKBACK_DAYS_HOURS_STR); | ||
properties.setProperty(TimeAwareRecursiveCopyableDataset.DATE_PATTERN_KEY, "yyyy/MM/dd/HH"); | ||
dataset = new TimeAwareRecursiveCopyableDataset(fs, baseDir1, properties, | ||
new Path("/tmp/src/*/hourly")); | ||
fileStatusList = dataset.getFilesAtPath(fs, baseDir1, pathFilter); | ||
candidateFiles = new HashSet<>(); | ||
datePattern = "yyyy/MM/dd/HH"; | ||
formatter = DateTimeFormat.forPattern(datePattern); | ||
|
||
for (int i = 0; i < MAX_NUM_HOURLY_DIRS; i++) { | ||
String startDate = endDate.minusHours(i).toString(formatter); | ||
Path subDirPath = new Path(baseDir1, new Path(startDate)); | ||
Path filePath = new Path(subDirPath, i + ".avro"); | ||
if (i < NUM_DAYS_HOURS_DIRS + 1) { | ||
candidateFiles.add(filePath.toString()); | ||
} | ||
} | ||
|
||
Assert.assertEquals(fileStatusList.size(), NUM_DAYS_HOURS_DIRS + 1); | ||
for (FileStatus fileStatus: fileStatusList) { | ||
Assert.assertTrue(candidateFiles.contains(PathUtils.getPathWithoutSchemeAndAuthority(fileStatus.getPath()).toString())); | ||
} | ||
|
||
//Lookback time = "2d" | ||
datePattern = "yyyy/MM/dd"; | ||
formatter = DateTimeFormat.forPattern(datePattern); | ||
endDate = LocalDateTime.now(); | ||
|
||
candidateFiles = new HashSet<>(); | ||
for (int i = 0; i < MAX_NUM_DAILY_DIRS; i++) { | ||
String startDate = endDate.minusDays(i).toString(formatter); | ||
Path subDirPath = new Path(baseDir2, new Path(startDate)); | ||
fs.mkdirs(subDirPath); | ||
Path filePath = new Path(subDirPath, i + ".avro"); | ||
fs.create(filePath); | ||
if (i < (NUM_LOOKBACK_DAYS + 1)) { | ||
candidateFiles.add(filePath.toString()); | ||
} | ||
} | ||
|
||
properties = new Properties(); | ||
properties.setProperty(TimeAwareRecursiveCopyableDataset.LOOKBACK_TIME_KEY, NUM_LOOKBACK_DAYS_STR); | ||
properties.setProperty(TimeAwareRecursiveCopyableDataset.DATE_PATTERN_KEY, "yyyy/MM/dd"); | ||
|
||
dataset = new TimeAwareRecursiveCopyableDataset(fs, baseDir2, properties, | ||
new Path("/tmp/src/*/daily")); | ||
fileStatusList = dataset.getFilesAtPath(fs, baseDir2, pathFilter); | ||
|
||
Assert.assertEquals(fileStatusList.size(), NUM_LOOKBACK_DAYS + 1); | ||
for (FileStatus fileStatus: fileStatusList) { | ||
Assert.assertTrue(candidateFiles.contains(PathUtils.getPathWithoutSchemeAndAuthority(fileStatus.getPath()).toString())); | ||
} | ||
} | ||
|
||
@Test (expectedExceptions = AssertionError.class) | ||
public void testInstantiationError() { | ||
//Daily directories, but look back time has days and hours. We should expect an assertion error. | ||
Properties properties = new Properties(); | ||
properties.setProperty(TimeAwareRecursiveCopyableDataset.LOOKBACK_TIME_KEY, NUM_LOOKBACK_DAYS_HOURS_STR); | ||
properties.setProperty(TimeAwareRecursiveCopyableDataset.DATE_PATTERN_KEY, "yyyy/MM/dd"); | ||
|
||
TimeAwareRecursiveCopyableDataset dataset = new TimeAwareRecursiveCopyableDataset(fs, baseDir2, properties, | ||
new Path("/tmp/src/*/daily")); | ||
} | ||
|
||
@AfterClass | ||
public void clean() throws IOException { | ||
//Delete tmp directories | ||
this.fs.delete(baseDir1, true); | ||
this.fs.delete(baseDir2, true); | ||
} | ||
} |