
Commit

Merge 90ad31b into 9ee3595
nathanielparke committed Apr 10, 2018
2 parents 9ee3595 + 90ad31b commit cdf5db6
Showing 205 changed files with 40,041 additions and 7,811 deletions.
1 change: 1 addition & 0 deletions .gitignore 100644 → 100755
@@ -13,6 +13,7 @@ syntax: glob
.project
.manager
.scala_dependencies
.iml

# idea
*.idea
96 changes: 0 additions & 96 deletions ADAMKryoRegistrator.scala

This file was deleted.

26 changes: 0 additions & 26 deletions GnocchiFunSuite.scala

This file was deleted.

31 changes: 0 additions & 31 deletions Gnocchi_README.md

This file was deleted.

Empty file modified LICENSE 100644 → 100755
Empty file.
33 changes: 21 additions & 12 deletions README.md 100644 → 100755
@@ -1,13 +1,13 @@
# gnocchi
# Gnocchi

[![Coverage Status](https://coveralls.io/repos/github/bigdatagenomics/gnocchi/badge.svg?branch=master)](https://coveralls.io/github/bigdatagenomics/gnocchi?branch=master)

Genotype-phenotype analysis using the [ADAM](https://github.com/bigdatagenomics/adam) genomics analysis platform.
This is work-in-progress. Currently, we implement a simple case/control analysis using a Chi squared test.
Statistical associations using the [ADAM](https://github.com/bigdatagenomics/adam) genomics analysis platform.
The currently supported operations are Genome Wide Association using Linear and Logistic models with either Dominant or Additive assumptions.

# Build

To build, install [Maven](http://maven.apache.org). Then run:
To build, install [Maven](http://maven.apache.org). Then, from the gnocchi directory, run:

```
mvn package
@@ -19,29 +19,38 @@ by setting the `MAVEN_OPTS` environment variable to `-Xmx2g -XX:MaxPermSize=1g`.
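For example (a minimal sketch, assuming a bash-like shell), the variable can be exported before invoking Maven:

```
# values taken from the note above
export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=1g"
mvn package
```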

# Run

To run, you'll need to install Spark. If you are just evaluating locally, you can use
Gnocchi is built on top of [Apache Spark](http://spark.apache.org). If you are just evaluating locally, you can use
[a prebuilt Spark distribution](http://spark.apache.org/downloads.html). If you'd like to
use a cluster, refer to Spark's [cluster overview](http://spark.apache.org/docs/latest/cluster-overview.html).

Once Spark is installed, set the environment variable `SPARK_HOME` to point to the Spark
installation root directory. Then, you can run `gnocchi` via `./bin/gnocchi-submit`.
installation root directory.
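For example (the install location below is hypothetical; use wherever Spark was unpacked):

```
# hypothetical Spark install location
export SPARK_HOME=/usr/local/spark
```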

We include test data. You can run with the test data by running:
The target binaries are compiled to the `bin/` directory. Add them to your path with:

```
./bin/gnocchi-submit regressPhenotypes testData/sample.vcf testData/samplePhenotypes.csv testData/associations -saveAsText
echo "export PATH=[GNOCCHI INSTALLATION DIR]/gnocchi/bin:\$PATH" >> $HOME/.bashrc
source $HOME/.bashrc
```

You can then run `gnocchi` via `gnocchi-submit`, or open a shell using `gnocchi-shell`.

Test data is included. You can run with the test data by running:

```
gnocchi-submit regressPhenotypes examples/testData/5snps10samples.vcf examples/testData/10samples5Phenotypes2covars.txt ADDITIVE_LINEAR ../associations -saveAsText -phenoName pheno1 -covar -covarFile examples/testData/10samples5Phenotypes2covars.txt -covarNames pheno4,pheno5 -sampleIDName SampleID
```

## Phenotype Input

We accept phenotype inputs in a CSV format:
Format your phenotype files as CSV or tab-delimited text, and include a header.

```
Sample,Phenotype,Has Phenotype
mySample,a phenotype,true
SampleID pheno1 pheno2
00001 0.001 0.002
```

The `has phenotype` column is binary true/false. See the test data for more descriptions.
Note: phenotypes and covariates must be numerical. For nominal-scale (categorical) data, binarize the values; for ordinal-scale data, convert them to integers.
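For example (hypothetical phenotype names), a nominal `smoker` column with values `yes`/`no` could be binarized to `1`/`0`, and an ordinal `stage` column (`I` < `II` < `III`) recoded as integers:

```
SampleID smoker stage
00001 1 3
00002 0 1
```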

# License

100 changes: 100 additions & 0 deletions bin/gnocchi-shell
@@ -0,0 +1,100 @@
#!/usr/bin/env bash
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e

# Split args into Spark and GNOCCHI args
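# (when a literal "--" is present, everything before it is forwarded to
#  spark-shell and everything after it is left in "$@" for the final command)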
DD=False # DD is "double dash"
PRE_DD=()
POST_DD=()
for ARG in "$@"; do
shift
if [[ $ARG == "--" ]]; then
DD=True
POST_DD=( "$@" )
break
fi
PRE_DD+=("$ARG")
done

if [[ $DD == True ]]; then
SPARK_ARGS=("${PRE_DD[@]}")
GNOCCHI_ARGS=("${POST_DD[@]}")
else
SPARK_ARGS=()
GNOCCHI_ARGS=("${PRE_DD[@]}")
fi

# Figure out where GNOCCHI is installed
SCRIPT_DIR="$(cd "$(dirname "$0")/.."; pwd)"

# does the user have GNOCCHI_OPTS set? if yes, then warn
if [[ $# -eq 0 && -n "$GNOCCHI_OPTS" ]]; then
echo "WARNING: Passing Spark arguments via GNOCCHI_OPTS was recently removed."
echo "Run gnocchi-shell instead as gnocchi-shell <spark-args>"
fi

# Find GNOCCHI cli assembly jar
GNOCCHI_CLI_JAR=
if [ -d "$SCRIPT_DIR/repo" ]; then
ASSEMBLY_DIR="$SCRIPT_DIR/repo"
else
ASSEMBLY_DIR="$SCRIPT_DIR/gnocchi-assembly/target"
fi

num_jars="$(ls -1 "$ASSEMBLY_DIR" | grep "^gnocchi-assembly_[0-9A-Za-z\.-]*\.jar$" | grep -v javadoc | grep -v sources | wc -l)"
if [ "$num_jars" -eq "0" ]; then
echo "Failed to find GNOCCHI assembly in $ASSEMBLY_DIR." 1>&2
echo "You need to build GNOCCHI before running this program." 1>&2
exit 1
fi

ASSEMBLY_JARS="$(ls -1 "$ASSEMBLY_DIR" | grep "^gnocchi-assembly_[0-9A-Za-z\.-]*\.jar$" | grep -v javadoc | grep -v sources || true)"
if [ "$num_jars" -gt "1" ]; then
echo "Found multiple GNOCCHI cli assembly jars in $ASSEMBLY_DIR:" 1>&2
echo "$ASSEMBLY_JARS" 1>&2
echo "Please remove all but one jar." 1>&2
exit 1
fi

GNOCCHI_CLI_JAR="${ASSEMBLY_DIR}/${ASSEMBLY_JARS}"

if [ -z "$SPARK_HOME" ]; then
SPARK_SHELL=$(which spark-shell || echo)
else
SPARK_SHELL="$SPARK_HOME"/bin/spark-shell
fi
if [ -z "$SPARK_SHELL" ]; then
echo "SPARK_HOME not set and spark-shell not on PATH; Aborting."
exit 1
fi
echo "Using SPARK_SHELL=$SPARK_SHELL"
echo "Welcome to"

ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

cat "$ABSOLUTE_PATH/logo.txt"

# launch the Spark shell with Kryo serialization configured and the Gnocchi
# assembly jar on the classpath
"$SPARK_SHELL" \
  --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \
  --conf spark.kryo.registrator=org.bdgenomics.adam.serialization.ADAMKryoRegistrator \
  "${SPARK_ARGS[@]}" \
  --jars "${GNOCCHI_CLI_JAR}" \
  "$@"
