Merge branch 'master' into trainRFCmd
BMJHayward committed Jan 2, 2024
2 parents 2b1f69d + da1c034 commit 5ebc84f
Showing 188 changed files with 55,402 additions and 3,561 deletions.
70 changes: 70 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,70 @@
# This workflow will build a Java project with Maven
# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven


#
# Notes:
# - we need to install pypandoc first as it is needed by pyspark setup
# - using python 3.7 because platform specific wheel for pandas 0.23.x required
# by hail is not available for python 3.8
#

name: Verify

on:
  push:
    branches:
      - master
      - branch-*
  pull_request:
    branches:
      - master
      - branch-*
jobs:
  build:
    runs-on: ubuntu-latest
    env:
      BUILD_TYPE: issue
    steps:
      - name: Extract version from branch name (for release branches)
        if: startsWith(github.event.pull_request.head.ref, 'release/')
        run: |
          BRANCH_NAME="${{ github.event.pull_request.head.ref }}"
          VERSION=${BRANCH_NAME#release/}
          echo "BUILD_VERSION=$VERSION" >> $GITHUB_ENV
          echo "BUILD_TYPE=release" >> $GITHUB_ENV
      - uses: actions/checkout@v2
      - name: Set up JDK 1.8
        uses: actions/setup-java@v1
        with:
          java-version: 1.8
      - name: Cache Maven packages
        uses: actions/cache@v2
        with:
          path: ~/.m2
          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
          restore-keys: ${{ runner.os }}-m2
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - name: Cache Python packages
        uses: actions/cache@v2
        id: pythoncache
        with:
          path: /home/runner/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('dev/dev-requirements.txt') }}
          restore-keys: ${{ runner.os }}-pip
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip cache dir
          pip install pypandoc==1.5
          if [ -f dev/dev-requirements.txt ]; then pip install -r dev/dev-requirements.txt; fi
      - name: Validate Python code
        run: pylint --max-line-length=140 python/varspark --rcfile python/pylintrc
      - name: Build with Maven
        run: mvn -Dbuild.type=${BUILD_TYPE} -B package --file pom.xml
      - name: Test with pytest
        run: |
          dev/py-test.sh
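
The release-branch step above uses plain Bash parameter expansion to derive the version. A minimal standalone sketch, with a hypothetical branch name standing in for the pull-request head ref, behaves like this:

    BRANCH_NAME="release/0.5.0"        # hypothetical; in CI this comes from github.event.pull_request.head.ref
    VERSION=${BRANCH_NAME#release/}    # strips the leading "release/" prefix
    echo "$VERSION"                    # prints 0.5.0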
92 changes: 92 additions & 0 deletions .github/workflows/publish-release.yml
@@ -0,0 +1,92 @@
# This workflow will build a Java project with Maven
# For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven


#
# Notes:
# - we need to install pypandoc first as it is needed by pyspark setup
# - using python 3.7 because platform specific wheel for pandas 0.23.x required
# by hail is not available for python 3.8
#

name: Publish Release

on:
  push:
    tags:
      - 'v**'
jobs:
  deploy:
    name: Deploy to Maven Central and PyPI
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up JDK 1.8
        uses: actions/setup-java@v1
        with:
          java-version: 1.8
      - name: Cache Maven packages
        uses: actions/cache@v2
        with:
          path: ~/.m2
          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
          restore-keys: ${{ runner.os }}-m2
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - name: Cache Python packages
        uses: actions/cache@v2
        id: pythoncache
        with:
          path: /home/runner/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('dev/dev-requirements.txt') }}
          restore-keys: ${{ runner.os }}-pip
      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip cache dir
          pip install pypandoc==1.5 setuptools wheel twine
          if [ -f dev/dev-requirements.txt ]; then pip install -r dev/dev-requirements.txt; fi
      - name: Install GPG key
        run: |
          cat <(echo -e "${{ secrets.GPG_KEY }}") | gpg --batch --import
          gpg --list-secret-keys --keyid-format LONG
      - name: Configure Maven settings
        uses: s4u/maven-settings-action@v2.6.0
        with:
          servers: |
            [{
              "id": "ossrh",
              "username": "${{ secrets.OSSRH_USERNAME }}",
              "password": "${{ secrets.OSSRH_PASSWORD }}"
            }]
      - name: Build with Maven
        run: |
          mvn -Prelease -DskipTests=true -Dgpg.passphrase="${{ secrets.GPG_PASSPHRASE }}" \
            --batch-mode --file pom.xml \
            deploy
      - name: Build and publish to PyPI
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          cd python
          python setup.py sdist bdist_wheel
          twine upload dist/*
  create-release:
    name: Draft GitHub release
    runs-on: ubuntu-latest
    needs: [ deploy ]
    steps:
      - name: Create release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          tag_name: ${{ github.ref }}
          release_name: ${{ github.ref }}
          draft: true
          prerelease: true
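
Given the `push: tags: - 'v**'` trigger above, a release build is started by pushing a matching tag. A hypothetical example (the version number is purely illustrative):

    git tag v0.5.0
    git push origin v0.5.0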
4 changes: 4 additions & 0 deletions .gitignore
@@ -59,3 +59,7 @@ dist
_build
spark-warehouse
.*.crc



version.py
16 changes: 16 additions & 0 deletions .readthedocs.yml
@@ -0,0 +1,16 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: python/docs/conf.py

# Optionally set the version of Python and requirements required to build your docs
python:
  version: 3.6
  install:
    - requirements: dev/rtd-requirements.txt
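
To preview roughly what Read the Docs builds from this configuration, a local sketch could be (assuming Sphinx is pulled in via dev/rtd-requirements.txt; the output directory is arbitrary):

    pip install -r dev/rtd-requirements.txt
    sphinx-build -b html python/docs python/docs/_build/html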
42 changes: 0 additions & 42 deletions .travis.yml

This file was deleted.

2 changes: 1 addition & 1 deletion LICENSE
@@ -27,7 +27,7 @@ MIT (https://opensource.org/licenses/mit-license.php)

Joda-Time (http://www.joda.org/joda-time/)
Copyright 2002-2016 Joda.org. All Rights Reserved.
Apache License Version 2.0 (http://www.joda.org/joda-time/license.html)
Apache License Version 2.0 (https://www.joda.org/joda-time/licenses.html)

fastutil (http://fastutil.di.unimi.it/)
Copyright 2003-2016 Paolo Boldi and Sebastiano Vigna
39 changes: 17 additions & 22 deletions README.md
@@ -1,6 +1,6 @@
# Variant Spark

[![Travis-Build](https://travis-ci.org/aehrc/VariantSpark.svg?branch=master)](https://travis-ci.org/aehrc/VariantSpark#)
[![Build](https://github.com/aehrc/VariantSpark/workflows/Java%20and%20Python%20CI%20with%20Maven/badge.svg)](https://github.com/aehrc/VariantSpark/actions?query=workflow%3ACI)
[![Documentation Status](https://readthedocs.org/projects/variantspark/badge/?version=latest)](http://variantspark.readthedocs.io/en/latest/?badge=latest)

_variant-spark_ is a scalable toolkit for genome-wide association studies optimized for GWAS like datasets.
@@ -39,7 +39,7 @@ In order to build the binaries use:

mvn clean install

For python _variant-spark_ requires python 2.7 with pip.
For python _variant-spark_ requires python 3.6+ with pip.
The other packages required for development are listed in `dev/dev-requirements.txt` and can be installed with:

pip install -r dev/dev-requirements.txt
@@ -54,7 +54,7 @@ The complete built including all check can be run with:

### Running

variant-spark requires an existing spark 2.1+ installation (either a local one or a cluster one).
variant-spark requires an existing spark 3.1+ installation (either a local one or a cluster one).

To run variant-spark use:

@@ -84,11 +84,11 @@ The difference between running in `--local` mode and in `--spark` with `local` m
Also the output will be written to the location determined by the hadoop filesystem settings. In particular paths without schema e.g. 'output.csv' will be resolved with the hadoop default filesystem (usually HDFS)
To change this behavior you can set the default filesystem in the command line using `spark.hadoop.fs.default.name` option. For example to use local filesystem as the default use:

veriant-spaek --spark ... --conf "spark.hadoop.fs.default.name=file:///" ... -- importance ... -of output.csv
./variant-spark --spark ... --conf "spark.hadoop.fs.default.name=file:///" ... -- importance ... -of output.csv

You can also use the full URI with the schema to address any filesystem for both input and output files e.g.:

veriant-spaek --spark ... --conf "spark.hadoop.fs.default.name=file:///" ... -- importance -if hdfs:///user/data/input.csv ... -of output.csv
./variant-spark --spark ... --conf "spark.hadoop.fs.default.name=file:///" ... -- importance -if hdfs:///user/data/input.csv ... -of output.csv
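
When the default filesystem is left as HDFS, a schema-less output path such as output.csv resolves relative to the user's HDFS home directory; the result can then be inspected with, for example (exact resolution depends on the cluster's Hadoop configuration):

    hdfs dfs -cat output.csv | head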

### Running examples

@@ -100,7 +100,7 @@ variant-spark comes with a few example scripts in the `scripts` directory that d

There is a few small data sets in the `data` directory suitable for running on a single machine. For example

./scripts/local_run-importance-ch22.sh
./examples/local_run-importance-ch22.sh

runs variable importance command on a small sample of the chromosome 22 vcf file (from 1000 Genomes Project)

@@ -120,35 +120,30 @@ You can choose a different location by setting the `VS_DATA_DIR` environment var

After the test data has been successfully copied to HDFS you can run examples scripts, e.g.:

./scripts/yarn_run-importance-ch22.sh
./examples/yarn_run-importance-ch22.sh

Note: if you installed the data to a non default location the `VS_DATA_DIR` needs to be set accordingly when running the examples
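
For instance, a hypothetical run against data installed in a non-default location (the path below is illustrative only) would set the variable inline:

    VS_DATA_DIR=hdfs:///user/alice/variant-spark-data ./examples/yarn_run-importance-ch22.sh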

#### Databricks notebook examples
### VariantSpark on the cloud

For convenience we have also provided a sample end-to-end variant-spark workflow
in a Databricks (Jupyter-style) notebook for Spark 2.2. The examples, using a synthetic phenotype (Hipster-index)
can be found in the `notebook-examples` folder of this repository.
VariantSpark can easily be used in AWS and Azure. For more examples and information, check the [cloud folder](https://github.com/aehrc/VariantSpark/tree/master/cloud). For a quick start, check the few pointers below.

To use an example:
#### AWS Marketplace

1. **Create** a free, community [Databricks](https://databricks.com/) account
2. **Download** the `VariantSpark_HipsterIndex_Spark2.2.scala` file. Databricks notebook is for Spark 2.2 (scala 2.11)
3. **Import** the notebook file into your Databricks instance. Read the instructions in the notebook on how to import a new library to use the `variant-spark` library.
4. **Start** a cluster (be sure to select the version of Spark and Scala specified in the notebook). Wait up to 5 minutes for the cluster to be ready.
5. **Attach** the notebook to the cluster
6. **Run** the sample notebook
VariantSpark is now available on [AWS Marketplace](https://aws.amazon.com/marketplace/pp/AEHRC-VariantSpark-Notebook/B07YVND4TD). Please read the [Guidlines](contributions/AwsMarketplace/README.md) for specification and step-by-step instructions.

#### Azure Databricks

VariantSpark can be easily deployed in Azure Databricks through the button below. Please read the [VariantSpark azure manual](https://github.com/aehrc/VariantSpark-Azure-deployment) for specification and step-by-step instructions.

[![Deploy to Azure](https://aka.ms/deploytoazurebutton)](https://portal.azure.com/#create/Microsoft.Template/uri/https%3A%2F%2Fraw.githubusercontent.com%2Faehrc%2FVariantSpark-Azure-deployment%2Fmaster%2Fazuredeploy.json)

### Contributions

#### JsonRfAnalyser

[JsonRfAnalyser](contributions/JsonRfAnalyser) is a python program that looks into the JSON RandomForest model and list variables on each tree and branch. Please read [README](contributions/JsonRfAnalyser/README.md) to see the complete list of functionalities.

#### VariantSpark on AWS Marketplace

VariantSpark is now available on [AWS Marketplace](https://aws.amazon.com/marketplace/pp/AEHRC-VariantSpark-Notebook/B07YVND4TD). Please read the [Guidlines](contributions/AwsMarketplace/README.md) for specification and step-by-step instructions.

#### WebVisualiser

[rfview.html](contributions/WebVisualiser/rfview.html) is a web program (run locally on your machine) where you can upload the json model produced by variantspark and it visualises trees in the model. You can identify which tree to be visualised. Node color and node labels could be set to different parameters such as number of samples in the node or the node impurity. It uses [vis.js](https://visjs.org/) for tree Visualisation.
