# Quick start

## Download sample data

In [None]:
%%bash
# Download the sample inputs used by this notebook into ./data:
# a BAM file with its .bai index, a FASTA file with its .fai index,
# and a GA4GH alignments JSON file.
#
# Abort on the first failing command so that a failed `cd` or a broken
# download is reported instead of being silently ignored.
set -euo pipefail

mkdir -p data
cd data

urls=(
    "http://biodatageeks.ii.pw.edu.pl/sequila/data/NA12878.multichrom.md.bam"
    "http://biodatageeks.ii.pw.edu.pl/sequila/data/NA12878.multichrom.md.bam.bai"
    "http://biodatageeks.ii.pw.edu.pl/sequila/data/Homo_sapiens_assembly18_chr1_chrM.small.fasta"
    "http://biodatageeks.ii.pw.edu.pl/sequila/data/Homo_sapiens_assembly18_chr1_chrM.small.fasta.fai"
    "https://raw.githubusercontent.com/bigdatagenomics/mango/master/mango-pileup/examples/data/alignments.ga4gh.chr17.1-250.json"
)

for url in "${urls[@]}"; do
    # Local file name is the last path component of the URL.
    target="${url##*/}"

    # Skip files fetched by a previous run; a plain wget would otherwise
    # store a second copy under a ".1" suffix every time the cell is re-run.
    if [ -s "$target" ]; then
        continue
    fi

    # Download under a temporary name and rename on success, so an
    # interrupted transfer never leaves a truncated file under the final name.
    wget --quiet --output-document="$target.part" "$url"
    mv "$target.part" "$target"
done

In [None]:
import os

# Directory (under the current working directory) holding the sample files.
base_path = os.getcwd() + "/data"

# Input files: the alignments (BAM) and the reference sequence (FASTA).
bam_path = base_path + "/NA12878.multichrom.md.bam"
ref_path = base_path + "/Homo_sapiens_assembly18_chr1_chrM.small.fasta"

# Identifiers interpolated into the SQL statements further down.
sample_id = "NA12878"
table_name = "reads"

# Name passed to the session builder.
app_name = "sequila"


## Check if your env variables are properly set

Ensure that:

* you use Java 11 and have set JAVA_HOME accordingly,
* SPARK_HOME points to an Apache Spark 3.1.2 installation,
* the SeQuiLa Scala package is listed in your PYSPARK_SUBMIT_ARGS (i.e. via the `--packages` parameter).


In [None]:
%%bash
# Print the environment variables this notebook relies on, one per line.
printf '%s\n' "JAVA_HOME is $JAVA_HOME"
printf '%s\n' "SPARK_HOME is $SPARK_HOME"
printf '%s\n' "PYSPARK_SUBMIT_ARGS is $PYSPARK_SUBMIT_ARGS"

# Report the versions of the tools found on PATH.
java -version
spark-shell --version

## Install pysequila and pandas

In [None]:
!pip install pysequila==$VERSION pandas

## Initialize PySequila Session

In [None]:
from pysequila import SequilaSession

# Obtain the session through the builder, naming the application after `app_name`.
ss = (
    SequilaSession.builder
    .appName(f'{app_name}')
    .getOrCreate()
)


## Create a table over BAM files

In [None]:
# Register `table_name` over the BAM file at `bam_path`; the statement is a
# no-op when the table already exists (IF NOT EXISTS).
# The pieces below are concatenated into a single-line SQL string.
ss.sql(
    f'CREATE TABLE IF NOT EXISTS {table_name} '
    f'         USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource '
    f'         OPTIONS(path "{bam_path}")'
)

## Run a simple select statement

In [None]:
import pandas as pd

# Do not truncate the column list when the frame is displayed.
pd.set_option("display.max_columns", None)

# Fetch the first five rows of the table and show them as a pandas DataFrame.
ss.sql(
    f'SELECT sample_id, contig, pos, cigar, seq '
    f'        FROM {table_name} LIMIT 5'
).toPandas()

## Calculate pileup

In [None]:
# Query the `pileup` table function for the configured table, sample and
# reference file, and show the first ten rows as a pandas DataFrame.
# NOTE(review): the meaning of the two trailing `true` flags is not visible
# in this notebook; check the SeQuiLa pileup documentation.
ss.sql(
    f'SELECT contig, pos_start, pos_end, ref, coverage, countRef, alts '
    f"      FROM  pileup('{table_name}', '{sample_id}', '{ref_path}', true, true ) LIMIT 10"
).toPandas()


In [None]:
# Stop the session `ss` now that all queries in this notebook have run.
ss.stop()