From d58a8fec82ebe1b1b2c80b22788d0b8c6b764fc6 Mon Sep 17 00:00:00 2001 From: RJ Nowling Date: Tue, 25 Aug 2015 22:20:20 -0500 Subject: [PATCH] BIGTOP-1986. Extract location dataset from BigPetStore data generator --- .../bigtop-location-data/README.md | 50 +++++++++++++++ .../bigtop-location-data/build.gradle | 63 +++++++++++++++++++ .../bigtop-location-data/settings.gradle | 16 +++++ .../datagenerators/locations/Location.java} | 8 +-- .../locations/LocationConstants.java | 25 ++++++++ .../locations/LocationReader.java} | 41 +++++------- .../ACS_12_5YR_S1903/ACS_12_5YR_S1903.txt | 0 .../ACS_12_5YR_S1903_metadata.csv | 0 .../ACS_12_5YR_S1903_with_ann.csv | 0 .../resources/input_data/population_data.csv | 0 .../src/main/resources/input_data/zips.csv | 0 11 files changed, 173 insertions(+), 30 deletions(-) create mode 100644 bigtop-data-generators/bigtop-location-data/README.md create mode 100644 bigtop-data-generators/bigtop-location-data/build.gradle create mode 100644 bigtop-data-generators/bigtop-location-data/settings.gradle rename bigtop-data-generators/{bigpetstore-data-generator/src/main/java/org/apache/bigtop/datagenerators/bigpetstore/datamodels/inputs/ZipcodeRecord.java => bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/Location.java} (90%) create mode 100644 bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/LocationConstants.java rename bigtop-data-generators/{bigpetstore-data-generator/src/main/java/org/apache/bigtop/datagenerators/bigpetstore/datareaders/ZipcodeReader.java => bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/LocationReader.java} (82%) rename bigtop-data-generators/{bigpetstore-data-generator => bigtop-location-data}/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903.txt (100%) rename bigtop-data-generators/{bigpetstore-data-generator => 
bigtop-location-data}/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_metadata.csv (100%) rename bigtop-data-generators/{bigpetstore-data-generator => bigtop-location-data}/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_with_ann.csv (100%) rename bigtop-data-generators/{bigpetstore-data-generator => bigtop-location-data}/src/main/resources/input_data/population_data.csv (100%) rename bigtop-data-generators/{bigpetstore-data-generator => bigtop-location-data}/src/main/resources/input_data/zips.csv (100%) diff --git a/bigtop-data-generators/bigtop-location-data/README.md b/bigtop-data-generators/bigtop-location-data/README.md new file mode 100644 index 0000000000..9aff06f9c0 --- /dev/null +++ b/bigtop-data-generators/bigtop-location-data/README.md @@ -0,0 +1,50 @@ + +BigTop Location Data +==================== + +U.S. zipcode data including GPS coordinates, median household incomes, +and population sizes from the U.S. Census along with a reader and +data model. + +Building and Testing +-------------------- +We use the Gradle build system for the location data library so you'll need +to install Gradle on your system. +Once that's done, you can use gradle to run the included unit tests +and build the location data jar. + +To build: + + $ gradle build + +This will create several directories and a jar located at: + + build/libs/bigtop-location-data-1.1.0-SNAPSHOT.jar + +Building automatically runs the included unit tests. 
If you would prefer +to just run the unit tests, you can do so by: + + $ gradle test + +To clean up the build files, run: + + $ gradle clean + +To install a jar into your local maven repository: + + $ gradle install diff --git a/bigtop-data-generators/bigtop-location-data/build.gradle b/bigtop-data-generators/bigtop-location-data/build.gradle new file mode 100644 index 0000000000..9eb91e39e7 --- /dev/null +++ b/bigtop-data-generators/bigtop-location-data/build.gradle @@ -0,0 +1,63 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +apply plugin: 'eclipse' +apply plugin: 'groovy' +apply plugin: 'java' +apply plugin: 'maven' +group = 'org.apache.bigtop' +version = '1.1.0-SNAPSHOT' + +jar { + + from { + configurations.runtime.collect { + it.isDirectory() ? 
it : zipTree(it) + } + } + + manifest { + attributes 'Title': 'BigTop Samplers', 'Version': version + } +} + +repositories { + mavenLocal() + mavenCentral() +} + +test { + // show standard out and error on console + testLogging.showStandardStreams = true + + // listen to events in the test execution lifecycle + beforeTest { descriptor -> + logger.lifecycle("Running test: " + descriptor) + } + + // listen to standard out and standard error of the test JVM(s) + onOutput { descriptor, event -> + logger.lifecycle("Test: " + descriptor + " produced standard out/err: " + event.message ) + } + +} + +dependencies { + compile 'com.google.guava:guava:18.0' + + compile 'org.apache.commons:commons-lang3:3.4' + + testCompile 'junit:junit:4.+' +} diff --git a/bigtop-data-generators/bigtop-location-data/settings.gradle b/bigtop-data-generators/bigtop-location-data/settings.gradle new file mode 100644 index 0000000000..f66bfdbc26 --- /dev/null +++ b/bigtop-data-generators/bigtop-location-data/settings.gradle @@ -0,0 +1,16 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +rootProject.name = "bigtop-location-data" \ No newline at end of file diff --git a/bigtop-data-generators/bigpetstore-data-generator/src/main/java/org/apache/bigtop/datagenerators/bigpetstore/datamodels/inputs/ZipcodeRecord.java b/bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/Location.java similarity index 90% rename from bigtop-data-generators/bigpetstore-data-generator/src/main/java/org/apache/bigtop/datagenerators/bigpetstore/datamodels/inputs/ZipcodeRecord.java rename to bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/Location.java index e5eeb600f7..62afc0a883 100644 --- a/bigtop-data-generators/bigpetstore-data-generator/src/main/java/org/apache/bigtop/datagenerators/bigpetstore/datamodels/inputs/ZipcodeRecord.java +++ b/bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/Location.java @@ -13,13 +13,13 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.bigtop.datagenerators.bigpetstore.datamodels.inputs; +package org.apache.bigtop.datagenerators.locations; import java.io.Serializable; import org.apache.commons.lang3.tuple.Pair; -public class ZipcodeRecord implements Serializable +public class Location implements Serializable { private static final long serialVersionUID = 1769986686070108470L; @@ -30,7 +30,7 @@ public class ZipcodeRecord implements Serializable final double medianHouseholdIncome; final long population; - public ZipcodeRecord(String zipcode, Pair coordinates, + public Location(String zipcode, Pair coordinates, String city, String state, double medianHouseholdIncome, long population) { this.city = city; @@ -61,7 +61,7 @@ public long getPopulation() return population; } - public double distance(ZipcodeRecord other) + public double distance(Location other) { if(other.getZipcode().equals(zipcode)) return 0.0; diff --git a/bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/LocationConstants.java b/bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/LocationConstants.java new file mode 100644 index 0000000000..4140bfd4a3 --- /dev/null +++ b/bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/LocationConstants.java @@ -0,0 +1,25 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.bigtop.datagenerators.locations; + +import java.io.File; + +public class LocationConstants +{ + public static final File COORDINATES_FILE = new File("zips.csv"); + public static final File INCOMES_FILE = new File("ACS_12_5YR_S1903/ACS_12_5YR_S1903_with_ann.csv"); + public static final File POPULATION_FILE = new File("population_data.csv"); +} diff --git a/bigtop-data-generators/bigpetstore-data-generator/src/main/java/org/apache/bigtop/datagenerators/bigpetstore/datareaders/ZipcodeReader.java b/bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/LocationReader.java similarity index 82% rename from bigtop-data-generators/bigpetstore-data-generator/src/main/java/org/apache/bigtop/datagenerators/bigpetstore/datareaders/ZipcodeReader.java rename to bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/LocationReader.java index 2478c8e1b4..5be4d59d8e 100644 --- a/bigtop-data-generators/bigpetstore-data-generator/src/main/java/org/apache/bigtop/datagenerators/bigpetstore/datareaders/ZipcodeReader.java +++ b/bigtop-data-generators/bigtop-location-data/src/main/java/org/apache/bigtop/datagenerators/locations/LocationReader.java @@ -13,8 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.bigtop.datagenerators.bigpetstore.datareaders; +package org.apache.bigtop.datagenerators.locations; +import java.io.BufferedInputStream; +import java.io.File; import java.io.FileNotFoundException; import java.io.InputStream; import java.util.HashSet; @@ -24,14 +26,13 @@ import java.util.Set; import java.util.Vector; -import org.apache.bigtop.datagenerators.bigpetstore.datamodels.inputs.ZipcodeRecord; import org.apache.commons.lang3.tuple.Pair; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; -public class ZipcodeReader +public class LocationReader { private static class ZipcodeLocationRecord { @@ -47,24 +48,11 @@ public ZipcodeLocationRecord(Pair coordinates, this.state = state; } } - - InputStream zipcodeIncomesFile = null; - InputStream zipcodePopulationFile = null; - InputStream zipcodeCoordinatesFile = null; - - public void setIncomesFile(InputStream path) - { - this.zipcodeIncomesFile = path; - } - - public void setPopulationFile(InputStream path) - { - this.zipcodePopulationFile = path; - } - - public void setCoordinatesFile(InputStream path) + + private InputStream getResource(File filename) { - this.zipcodeCoordinatesFile = path; + InputStream stream = getClass().getResourceAsStream("/input_data/" + filename); + return new BufferedInputStream(stream); } private ImmutableMap readIncomeData(InputStream path) throws FileNotFoundException @@ -166,20 +154,21 @@ private ImmutableMap readCoordinates(InputStream return ImmutableMap.copyOf(entries); } - public ImmutableList readData() throws FileNotFoundException + public ImmutableList readData() throws FileNotFoundException { - ImmutableMap incomes = readIncomeData(this.zipcodeIncomesFile); - ImmutableMap populations = readPopulationData(this.zipcodePopulationFile); - ImmutableMap coordinates = readCoordinates(this.zipcodeCoordinatesFile); + + ImmutableMap incomes = 
readIncomeData(getResource(LocationConstants.INCOMES_FILE)); + ImmutableMap populations = readPopulationData(getResource(LocationConstants.POPULATION_FILE)); + ImmutableMap coordinates = readCoordinates(getResource(LocationConstants.COORDINATES_FILE)); Set zipcodeSubset = new HashSet(incomes.keySet()); zipcodeSubset.retainAll(populations.keySet()); zipcodeSubset.retainAll(coordinates.keySet()); - List table = new Vector(); + List table = new Vector(); for(String zipcode : zipcodeSubset) { - ZipcodeRecord record = new ZipcodeRecord(zipcode, + Location record = new Location(zipcode, coordinates.get(zipcode).coordinates, coordinates.get(zipcode).city, coordinates.get(zipcode).state, diff --git a/bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903.txt b/bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903.txt similarity index 100% rename from bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903.txt rename to bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903.txt diff --git a/bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_metadata.csv b/bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_metadata.csv similarity index 100% rename from bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_metadata.csv rename to bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_metadata.csv diff --git a/bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_with_ann.csv 
b/bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_with_ann.csv similarity index 100% rename from bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_with_ann.csv rename to bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/ACS_12_5YR_S1903/ACS_12_5YR_S1903_with_ann.csv diff --git a/bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/population_data.csv b/bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/population_data.csv similarity index 100% rename from bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/population_data.csv rename to bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/population_data.csv diff --git a/bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/zips.csv b/bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/zips.csv similarity index 100% rename from bigtop-data-generators/bigpetstore-data-generator/src/main/resources/input_data/zips.csv rename to bigtop-data-generators/bigtop-location-data/src/main/resources/input_data/zips.csv