In [1]:
from pyspark.sql import SparkSession

import os

import hail

# Locate the Hail assembly jar inside the installed `hail` package instead of
# hard-coding one developer's site-packages directory, so the notebook runs in
# any environment where the Hail Python package is installed.
HAIL_JAR = os.path.join(os.path.dirname(hail.__file__), "hail-all-spark.jar")
# variant-spark assembly jar, resolved relative to the notebook's working directory.
VARSPARK_JAR = "../target/variant-spark_2.11-0.3.0-SNAPSHOT-all.jar"

# Build (or reuse) the Spark session with both jars available and Kryo
# serialization configured with Hail's registrator.
spark = (
    SparkSession.builder
    .appName("HipsterIndex")
    .config("spark.driver.extraClassPath", HAIL_JAR)
    .config("spark.jars", ",".join([HAIL_JAR, VARSPARK_JAR]))
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrator", "is.hail.kryo.HailKryoRegistrator")
    .getOrCreate()
)

In [2]:
# Show the SparkContext of the session created above as the cell output.
spark.sparkContext

In [3]:
import hail as hl
# Initialise Hail on the already-created SparkContext (passed via `sc`).
hl.init(sc=spark.sparkContext)

using hail jar at /Users/szu004/miniconda2/envs/hail/lib/python3.6/site-packages/hail/hail-all-spark.jar
Running on Apache Spark version 2.4.1
SparkUI available at http://szu004-mac-dp.nexus.csiro.au:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.16-6da0d3571629
LOGGING: writing to /Users/szu004/dev/variant-spark/dev-notebooks/hail-20190716-1400-0.2.16-6da0d3571629.log


In [4]:
import varspark as vs
# Create the variant-spark context on top of the shared Spark session.
# NOTE(review): `vc` is not referenced again in the visible cells — confirm it is still needed.
vc = vs.VariantsContext(spark)

Running on Apache Spark version 2.4.1
SparkUI available at http://szu004-mac-dp.nexus.csiro.au:4040
Welcome to
 _    __           _             __  _____                  __    
| |  / /___ ______(_)___ _____  / /_/ ___/____  ____ ______/ /__  
| | / / __ `/ ___/ / __ `/ __ \/ __/\__ \/ __ \/ __ `/ ___/ //_/  
| |/ / /_/ / /  / / /_/ / / / / /_ ___/ / /_/ / /_/ / /  / ,<     
|___/\__,_/_/  /_/\__,_/_/ /_/\__//____/ .___/\__,_/_/  /_/|_|    
                                      /_/                         


In [5]:
from hail.plot import show
from pprint import pprint
# Presumably configures Hail's plotting backend to render figures inline in
# notebook cells — see the hl.plot.output_notebook documentation.
hl.plot.output_notebook()

In [6]:
# Import the block-gzipped VCF (path is relative to the notebook's working directory).
data = hl.import_vcf('../data/hipsterIndex/hipster.vcf.bgz')

In [7]:
# Print the schema (global, column and row fields) of the imported dataset.
data.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AA: str, 
        AC: array<int32>, 
        AF: array<float64>, 
        AFR_AF: array<float64>, 
        AMR_AF: array<float64>, 
        AN: int32, 
        CIEND: array<int32>, 
        CIPOS: array<int32>, 
        CS: str, 
        DP: int32, 
        EAS_AF: array<float64>, 
        END: int32, 
        EUR_AF: array<float64>, 
        EX_TARGET: bool, 
        IMPRECISE: bool, 
        MC: array<str>, 
        MEINFO: array<str>, 
        MEND: int32, 
        MLEN: int32, 
        MSTART: int32, 
        MULTI_ALLELIC: bool, 
        NS: int32, 
        SAS_AF: array<float64>, 
        SVLEN: array<int32>, 
        SVTYPE: str, 
        TSD: str, 
      

In [8]:
# Load the per-sample labels from a comma-delimited file: `label` and `score`
# are read as float64 (the remaining column keeps the default str type), and
# the table is then keyed by the `samples` column.
labels = hl.import_table(
    '../data/hipsterIndex/hipster_labels.txt',
    delimiter=',',
    types={'label': 'float64', 'score': 'float64'},
)
labels = labels.key_by('samples')

2019-07-16 14:00:56 Hail: INFO: Reading table with no type imputation
  Loading column 'samples' as type 'str' (type not specified)
  Loading column 'score' as type 'float64' (user-specified)
  Loading column 'label' as type 'float64' (user-specified)



In [9]:
# Print the schema and key of the labels table.
labels.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'samples': str 
    'score': float64 
    'label': float64 
----------------------------------------
Key: ['samples']
----------------------------------------


In [10]:
# Attach, for every sample, the matching row of `labels` (looked up by the
# sample id `s`) as a column field named `label`; the resulting struct carries
# the `score` and `label` values.
mt = data.annotate_cols(label = labels[data.s])
# Print the schema to verify the new `label` column field.
mt.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'label': struct {
        score: float64, 
        label: float64
    }
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'rsid': str
    'qual': float64
    'filters': set<str>
    'info': struct {
        AA: str, 
        AC: array<int32>, 
        AF: array<float64>, 
        AFR_AF: array<float64>, 
        AMR_AF: array<float64>, 
        AN: int32, 
        CIEND: array<int32>, 
        CIPOS: array<int32>, 
        CS: str, 
        DP: int32, 
        EAS_AF: array<float64>, 
        END: int32, 
        EUR_AF: array<float64>, 
        EX_TARGET: bool, 
        IMPRECISE: bool, 
        MC: array<str>, 
        MEINFO: array<str>, 
        MEND: int32, 
        MLEN: int32, 
        MSTART: int32, 
        MULTI_ALLELIC: bool, 
        NS: int32, 
        SAS_AF: array<float64>, 
 

In [11]:
# Count the rows and columns of the annotated dataset.
mt.count()

2019-07-16 14:00:58 Hail: INFO: Coerced almost-sorted dataset


(17010, 2504)

In [12]:
# Reduce the dataset to the fields handed to the random-forest model below:
#   y - response, taken from the `label.label` column annotation
#   z - constant 1.0 column field
#   e - number of alternate alleles of the genotype, per entry
# No extra row fields are selected and the column key is cleared.
# NOTE: `_select_all` is a private Hail method and may change between releases.
mts = mt._select_all(
    col_exprs={'y': mt.label.label, 'z': 1.0},
    row_exprs={},
    col_key=[],
    entry_exprs={'e': mt.GT.n_alt_alleles()},
)
mts.describe()
# Show the underlying (private) matrix IR object as the cell output.
mts._mir


----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    'y': float64
    'z': float64
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
----------------------------------------
Entry fields:
    'e': int32
----------------------------------------
Column key: None
Row key: ['locus', 'alleles']
----------------------------------------


<hail.ir.matrix_ir.MatrixRename at 0x11ed51828>

In [13]:
# Keep a handle on the matrix IR of the reduced dataset.
# NOTE(review): `mir` is not used in the visible cells (the model cell reads
# `mts._mir` directly) — confirm whether this cell is still needed.
mir = mts._mir


In [14]:
from hail.ir import *
# NOTE(review): the return value of Env.hail() is discarded; presumably the
# call is made for its initialisation side effect — confirm.
Env.hail()
# Convert the reduced dataset's matrix IR to its Java form and hand it to the
# variant-spark RFModel factory on the JVM side.
_jrf_model = (
    Env.jvm()
    .au.csiro.variantspark.hail.methods.RFModel
    .pyApply(Env.spark_backend('rf')._to_java_ir(mts._mir))
)

In [15]:
# Train the random forest on the JVM side.
# NOTE(review): the meaning of the two positional arguments is not visible
# here — presumably the number of trees and a batch size; confirm against
# RFModel.fitTrees and consider naming them as constants.
_jrf_model.fitTrees(2000,200)

In [16]:
from hail.table import Table
# A bare expression in the middle of a cell is never displayed, so the OOB
# error used to be computed and silently dropped; capture and print it instead.
oob_error = _jrf_model.oobError()
print(oob_error)
# Wrap the Java-side variable-importance result as a Hail Table.
impTable = Table._from_java(_jrf_model.variableImportance())

2019-07-16 14:13:21 Hail: INFO: Coerced sorted dataset


In [17]:
# Print the schema and key of the variable-importance table.
impTable.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<GRCh37> 
    'alleles': array<str> 
    'importance': float64 
----------------------------------------
Key: ['locus', 'alleles']
----------------------------------------


In [18]:
# Preview the first rows of the variable-importance table.
impTable.show()

locus,alleles,importance
locus<GRCh37>,array<str>,float64
2:109511398,"[""G"",""A""]",0.00726
2:109511454,"[""C"",""A""]",0.0018
2:109511463,"[""G"",""A""]",0.105
2:109511467,"[""GACTC"",""G""]",0.324
2:109511478,"[""C"",""T""]",0.00129
2:109511497,"[""G"",""T""]",0.00197
2:109511525,"[""G"",""GAATT""]",0.0253
2:109511527,"[""A"",""C""]",0.0008
2:109511532,"[""A"",""G""]",0.0
2:109511579,"[""C"",""G""]",0.000818


In [19]:
from typing import *
from hail.expr.expressions import *
from hail.expr.types import *
from hail.typecheck import *

class RandomForestModel(object):
    """Placeholder for a random-forest model handle (API sketch, no behaviour yet)."""

    def fit(self, n_trees=1000):
        """Fit the model (not implemented yet).

        The method previously lacked ``self``, so the instance itself was bound
        to ``n_trees`` and any explicit ``n_trees`` argument raised ``TypeError``.

        :param n_trees: intended number of trees; currently ignored.
        :return: None
        """
        pass
    
    

@typecheck(y=expr_float64,
           x=expr_float64,
           covariates=sequenceof(expr_float64))
def random_forest_model(y, x, covariates=()):
    """Prototype entry point: echo the arguments and return an empty model.

    :param y: response expression (coerced to float64 by ``typecheck``).
    :param x: input expression (coerced to float64 by ``typecheck``).
    :param covariates: sequence of float64 covariate expressions.
    :return: a new :class:`RandomForestModel`; the arguments are only printed,
        not stored on it.
    """
    for argument in (y, x, covariates):
        print(argument)
    return RandomForestModel()

@typecheck(model=RandomForestModel,
           pass_through=sequenceof(oneof(str, Expression)))
def random_forest_importance_rows(model, pass_through=()):
    """Stub for the per-row importance API; currently does nothing.

    :param model: a :class:`RandomForestModel` instance.
    :param pass_through: field names or expressions (unused for now).
    :return: None
    """
    return None

In [20]:
# Exercise the prototype API: the response is the `score` label, the input is
# the alternate-allele count, and the only covariate is the constant 1.0.
# The call prints its (type-coerced) arguments.
rf_model = random_forest_model(y=mt.label.score,
                    x=mt.GT.n_alt_alleles(),
                    covariates=[1.0])
# `fit` is currently a stub, so this call has no effect.
rf_model.fit()


<Float64Expression of type float64>
<Float64Expression of type float64>
[<Float64Expression of type float64>]


In [31]:
# Per-variant linear regression of the `score` label on the alternate-allele
# count, with the constant 1.0 as the only covariate; `rsid` is carried
# through into the result table.
gwas = hl.linear_regression_rows(y=mt.label.score,
                                 x=mt.GT.n_alt_alleles(),
                                 covariates=[1.0],
                                 pass_through=[mt.rsid])

2019-07-16 14:21:24 Hail: INFO: linear_regression_rows: running on 2504 samples for 1 response variable y,
    with input variable x, and 1 additional covariate...


In [33]:
# Preview the first rows of the regression results.
gwas.show()

locus,alleles,rsid,n,sum_x,y_transpose_x,beta,standard_error,t_stat,p_value
locus<GRCh37>,array<str>,str,int32,float64,float64,float64,float64,float64,float64
2:109511398,"[""G"",""A""]","""rs150055772""",2504,4.0,35.0,-0.703,1.62,-0.434,0.664
2:109511454,"[""C"",""A""]","""rs558429529""",2504,1.0,11.5,2.05,3.24,0.633,0.527
2:109511463,"[""G"",""A""]","""rs200762071""",2504,147.0,1300.0,-0.579,0.26,-2.23,0.0261
2:109511467,"[""GACTC"",""G""]","""rs145115545""",2504,547.0,5060.0,-0.207,0.142,-1.45,0.147
2:109511478,"[""C"",""T""]","""rs540842456""",2504,1.0,12.0,2.55,3.24,0.788,0.431
2:109511497,"[""G"",""T""]","""rs552720761""",2504,1.0,10.5,1.05,3.24,0.324,0.746
2:109511525,"[""G"",""GAATT""]","""rs147877575""",2504,13.0,99.5,-1.81,0.899,-2.01,0.0445
2:109511527,"[""A"",""C""]","""rs574502076""",2504,1.0,5.5,-3.95,3.24,-1.22,0.222
2:109511532,"[""A"",""G""]","""rs541552562""",2504,1.0,6.5,-2.95,3.24,-0.912,0.362
2:109511579,"[""C"",""G""]","""rs187935909""",2504,1.0,9.5,0.0481,3.24,0.0149,0.988


In [34]:
# Add the random-forest `importance` field to the regression results by
# joining with the importance table.
# NOTE(review): this rebinds `gwas` to a result derived from itself, so the
# cell is not idempotent — re-running it joins the importance table a second
# time. Re-run the regression cell first when re-executing.
gwas = gwas.join(impTable)

In [35]:
# Preview the joined table (regression statistics plus `importance`).
gwas.show()

locus,alleles,rsid,n,sum_x,y_transpose_x,beta,standard_error,t_stat,p_value,importance
locus<GRCh37>,array<str>,str,int32,float64,float64,float64,float64,float64,float64,float64
2:109511398,"[""G"",""A""]","""rs150055772""",2504,4.0,35.0,-0.703,1.62,-0.434,0.664,0.00726
2:109511454,"[""C"",""A""]","""rs558429529""",2504,1.0,11.5,2.05,3.24,0.633,0.527,0.0018
2:109511463,"[""G"",""A""]","""rs200762071""",2504,147.0,1300.0,-0.579,0.26,-2.23,0.0261,0.105
2:109511467,"[""GACTC"",""G""]","""rs145115545""",2504,547.0,5060.0,-0.207,0.142,-1.45,0.147,0.324
2:109511478,"[""C"",""T""]","""rs540842456""",2504,1.0,12.0,2.55,3.24,0.788,0.431,0.00129
2:109511497,"[""G"",""T""]","""rs552720761""",2504,1.0,10.5,1.05,3.24,0.324,0.746,0.00197
2:109511525,"[""G"",""GAATT""]","""rs147877575""",2504,13.0,99.5,-1.81,0.899,-2.01,0.0445,0.0253
2:109511527,"[""A"",""C""]","""rs574502076""",2504,1.0,5.5,-3.95,3.24,-1.22,0.222,0.0008
2:109511532,"[""A"",""G""]","""rs541552562""",2504,1.0,6.5,-2.95,3.24,-0.912,0.362,0.0
2:109511579,"[""C"",""G""]","""rs187935909""",2504,1.0,9.5,0.0481,3.24,0.0149,0.988,0.000818


In [37]:
# Scatter of -log10(p-value) against random-forest importance, one point per
# variant, with rsid and locus shown on hover.
# The package is imported as `hl`; the previous `hail.log10` referenced a name
# that no cell binds.
p = hl.plot.scatter(-hl.log10(gwas.p_value),
                    gwas.importance,
                    hover_fields=dict(rs=gwas.rsid, loc=gwas.locus))
show(p)

In [36]:
# Manhattan plot of the regression p-values with the rsid shown on hover.
# The hover key is `rs`, matching the key used for the same field in the
# scatter plot (it was previously `ri`).
p = hl.plot.manhattan(gwas.p_value, hover_fields=dict(rs=gwas.rsid))
show(p)