Spark-notebook to work with CRISPR screening data
--------------------------------------------------------

In [ ]:
import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
import org.apache.spark.sql.types.StructType
import org.bdgenomics.adam.rdd.ADAMContext._
import scala.reflect.runtime.universe._

import org.apache.spark.sql.{DataFrame, Encoders, SparkSession}
import org.apache.spark.sql.types.StructType
import org.bdgenomics.adam.rdd.ADAMContext._
import scala.reflect.runtime.universe._


In [ ]:
// Session used by every cell below; getOrCreate() hands back an already-running
// session when one exists instead of building a second one.
val spark = SparkSession.builder().appName("annotations").getOrCreate()

/** Loads a tab-separated file into an untyped DataFrame.
  *
  * @param path   location of the file, handed unchanged to Spark's CSV reader
  * @param header value of the reader's "header" option (true = first line holds column names)
  * @return the file contents as a DataFrame
  */
def readTSV(path: String, header: Boolean): DataFrame = {
  val tsvReader = spark.read
    .option("sep", "\t")
    .option("header", header)
  tsvReader.csv(path)
}
/** Reads a delimited text file into a typed Dataset whose schema is derived from case class `T`.
  *
  * The product encoder for `T` is built once and used twice: its schema is given to the
  * reader (so no schema inference takes place), and the encoder itself is passed explicitly
  * to `as`, so the conversion does not depend on `spark.implicits._` being in scope.
  *
  * @tparam T        case class describing one row; its fields define the column names and types
  * @param path      location of the file, handed unchanged to Spark's CSV reader
  * @param header    value of the reader's "header" option (true = first line holds column names)
  * @param delimiter column separator; defaults to a tab
  * @param tag       type information needed to derive the encoder for `T`
  * @return the file contents as a `Dataset[T]`
  */
def readTyped[T <: Product](path: String, header: Boolean, delimiter: String = "\t")(implicit tag: TypeTag[T]): org.apache.spark.sql.Dataset[T] = {
  // Immutable: the encoder (and the schema derived from it) never changes after creation.
  val encoder = Encoders.product[T]
  spark.read
    .option("sep", delimiter)
    .option("header", header)
    .schema(encoder.schema)
    .csv(path)
    .as[T](encoder)
}

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@a7e9b68
readTSV: (path: String, header: Boolean)org.apache.spark.sql.DataFrame
readTyped: [T <: Product](path: String, header: Boolean, delimiter: String)(implicit tag: reflect.runtime.universe.TypeTag[T])org.apache.spark.sql.Dataset[T]


Field descriptions
---------------------

Field	Type	Description

Name	String	Name of the sgRNA as provided by authors

chr	String	Chromosome name

start	Number	Start position of sgRNA on the genome

end	Number	End position of sgRNA on the genome

strand	String	One of '+' (forward) or '-' (reverse strand)

pubmed	String	PubMed ID of related article

score	Number	Score of the sgRNA target gene provided by authors (NA if not provided)

cellline	String	Cell line used for the experiment

screentype	String	Type of screen (positive selection, negative selection)

condition	String	Experiment condition (e.g. 'viability')

hit	String	One of 'true'/'false'

genetargets	String	${ENSEMBL_ID}::${GENE_SYMBOL}

log2fc	Number	Log2 fold change calculated from normalized read counts

scoredist	Array	A number of x/y coordinate pairs roughly representing the experiment score distribution

effect	Number	Effect size of sgRNA. Domain: [-9, 9]

rc_initial	Array	Array of 'initial' read counts (before dropout/without treatment). One element per replicate.

rc_final	Array	Array of 'final' read counts (after dropout/treatment)


In [ ]:
/** One row of the CRISPR screening table.
  *
  * Every column is declared as a String, so numeric and array-like columns
  * (positions, log2fc, read counts, effect) arrive as raw text and are not parsed here.
  * The field order is the order of the schema handed to the CSV reader, so it must
  * not be rearranged.
  */
case class Screening(
  start: String,      // start position of the sgRNA on the genome
  end: String,        // end position of the sgRNA on the genome
  chr: String,        // chromosome name
  strand: String,     // '+' (forward) or '-' (reverse strand)
  pubmed: String,     // PubMed ID of the related article
  cellline: String,   // cell line used for the experiment
  condition: String,  // experiment condition (e.g. 'viability')
  sequence: String,   // presumably the sgRNA nucleotide sequence -- TODO confirm
  symbol: String,     // presumably the target gene symbol -- TODO confirm
  ensg: String,       // presumably the Ensembl gene ID of the target -- TODO confirm
  log2fc: String,     // log2 fold change calculated from normalized read counts
  rc_initial: String, // 'initial' read counts (before dropout/without treatment)
  rc_final: String,   // 'final' read counts (after dropout/treatment)
  effect: String,     // effect size of the sgRNA; domain [-9, 9]
  cas: String,        // presumably the Cas nuclease used in the screen -- TODO confirm
  screentype: String  // type of screen (positive selection, negative selection)
)

defined class Screening


In [ ]:
// Location of the GenomeCRISPR export on the local filesystem.
val path = "file:///pipelines/indexes/crispr/GenomeCRISPR_full05112017.csv"
// Comma-separated file read with the header option enabled; each row becomes a Screening.
val c = readTyped[Screening](path, header = true, delimiter = ",")

path: String = file:///pipelines/indexes/crispr/GenomeCRISPR_full05112017.csv
c: org.apache.spark.sql.Dataset[Screening] = [start: string, end: string ... 14 more fields]


In [ ]:
// Pair every record with its screen type so records can be counted per type.
val byType = c.rdd.map(screening => (screening.screentype, screening))

byType: org.apache.spark.rdd.RDD[(String, Screening)] = MapPartitionsRDD[74] at map at <console>:84


In [ ]:
// Number of records per screen type, returned as a local Map.
byType.countByKey()

res72: scala.collection.Map[String,Long] = Map(null -> 489, negative selection -> 37089438, positive selection -> 1383773)


In [ ]:
// Pair every record with its experiment condition so records can be counted per condition.
val byCondition = c.rdd.map(screening => (screening.condition, screening))

byCondition: org.apache.spark.rdd.RDD[(String, Screening)] = MapPartitionsRDD[80] at map at <console>:84


In [ ]:
// Number of records per experiment condition, returned as a local Map.
byCondition.countByKey()

res77: scala.collection.Map[String,Long] = Map(null -> 489, viability after 27 days -> 82449, viability after 6 days -> 82315, viability after 21 days -> 82315, resistance to PLX after 14 days -> 57793, viability after 19 days -> 83463, viability; Vemurafenib treatment -> 97790, viability after 5 days -> 82315, viability after 9 days -> 164630, viability after 18 days -> 246945, viability after 13 days -> 165778, essential genes for PA/LFnDTA toxicity -> 1459, resistance to vemurafenib (GeCKOv1 lentiCRISPRv1) -> 64022, viability (GeCKOv2 library) -> 107966, resistance to vemurafenib (GeCKOv1 lentiGuide) -> 64022, viability after 15 days -> 329394, viability after 8 days -> 82315, viability; Trametinib treatment -> 292292, resistance to PLX-4720 (puromycin) -> 95255, resistance to vemura...

In [ ]:
// Peek at the first 20 parsed records.
c.head(20)

res83: Array[Screening] = Array(Screening(50844073,50844096,10,+,26472758,Jiyoye,viability,GCAGCATCCCAACCAGGTGGAGG,A1CF,ENSG00000148584,0.31590732393855947,[260],[244],2,hSpCas9,negative selection), Screening(50814011,50814034,10,-,26472758,Jiyoye,viability,GCGGGAGTGAGAGGACTGGGCGG,A1CF,ENSG00000148584,2.1441409756720797,[17],[59],9,hSpCas9,negative selection), Screening(50836111,50836134,10,+,26472758,Jiyoye,viability,ATGACTCTCATACTCCACGAAGG,A1CF,ENSG00000148584,1.4260344087571892,[75],[153],8,hSpCas9,negative selection), Screening(50836095,50836118,10,-,26472758,Jiyoye,viability,GAGTCATCGAGCAGCTGCCATGG,A1CF,ENSG00000148584,1.5501333353479165,[47],[105],8,hSpCas9,negative selection), Screening(50816234,50816257,10,-,26472758,Jiyoye,viability,AGTCACCCTAGCAAAACCAGTGG,A1CF,ENSG00000148584,...