# Samples processing<br>


This is a text cell. Start editing!




In [1]:
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._

<div><h2>Samples</h2></div>

In [3]:
val samples = spark.readTSV("/data/samples/species/samples_index.tsv", header=true).na.fill("N/A")
  .where($"index" =!= "N/A")
  .where($"library_strategy" === "RNA-Seq")
  .sort($"organism".desc, $"library_layout")
samples.show(10,1000)

+-----------+-----------+----------+----------------------+-----+-----------+----------------------------+----------------+--------------+-----------------+---------+---------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------+----------------+----+-----+--------+--------------+---------------------------------------+-----------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------+--------------+---

In [4]:
samples.count()

617

In [5]:
val res = spark.readTSV("/data/samples/from_view_ensembl_anage_rna_run.tsv", true).withColumnRenamed("run_accession", "run")
res.show(10,100000)

+----------+---------+-----------------------+------------------------+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+----------------------------+-----------------------------+-----------------+-------------------+--------------------+------+-------------+----------------------------------+--------------------+--------+----------------+--------------+---------------+-----------------------------------------------------+-----------+----------------+---------------+----------+-----------------------------------------------------------------+---------------+---------------------------------------------------------------------+----------------------------------

In [6]:
val res_filtered = res.where($"sample_attribute".rlike("brain|liver|kidney|lung|blood|cortex|heart"))
res_filtered.show(10,1000)

+----------+---------+-------------------------+---------------------+--------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+----------------------------+-----------------------------------------------+-----------------+----------------------------+--------------------+------+-------------+-------------------------------------------------------------------+--------------------+--------+----------------+--------------+---------------+------------------------------------------------------------------------------------------------------------+-----------+----------------+---------------------------------------------------------------------------------------------------------------------

In [7]:
val runs = samples.select("run").distinct().cache()
val anti_samples_filtered = res_filtered.join(runs, Seq("run"), "leftanti").where(!$"sample_attribute".contains("cell culture"))
anti_samples_filtered.writeTSV("/data/samples/anti_filtered.tsv", header = true,rewrite= true)

parts of /data/samples/anti_filtered.tsv merged!


/data/samples/anti_filtered.tsv

In [8]:
anti_samples_filtered.count()

1141

In [9]:
val anti_samples = res.join(runs, Seq("run"), "leftanti").cache().where(!$"sample_attribute".contains("cell culture"))
//anti_samples.writeTSV("/data/anti.tsv", header =true)

In [10]:
anti_samples.writeTSV("/data/samples/anti.tsv", header =true, rewrite = true)

parts of /data/samples/anti.tsv merged!


/data/samples/anti.tsv

In [11]:
anti_samples.select("run","class","common_name","sample_attribute", "experiment_title","total_species_runs_counts","study_title").show(500,10000)

+----------+--------------+------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
(res.count, anti_samples.count, samples.count)

(4397,4083,617)

In [13]:
samples.select("run", "organism", "genes").sort($"organism".desc).show(100000,1000000)

+----------+-------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|       run|                 organism|                                                                                                                                        genes|
+----------+-------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| SRR960568|   Zonotrichia albicollis|                                    /data/samples/species/PRJNA217789/SRX342293/SRR960568/PRJNA217789_SRX342293_SRR960568_genes_abundance.tsv|
| SRR960571|   Zonotrichia albicollis|                                    /data/samples/species/PRJNA217789/SRX342290/SRR960571/PRJNA217789_SRX342290_SRR960571_genes_abundance.tsv|
| SRR636932|         Ursus americanus|                                        /data/samples/spe

In [14]:
def prefix(prefix: String, sufix: String) =  udf[String, String]{ str=> prefix + str + sufix}

<div><b>Extended species file<br></b></div><div>-----------<br></div>

In [16]:
val anage = spark.readTSV("/data/databases/anage/anage_data.tsv", header = true)
 .withColumn("scientific_name", concat($"Genus", lit(" "), $"Species"))
anage.show()

+------+--------+----------+------------+-----------+--------------+-------------+------------+--------------------+----------------------+--------------------+---------------------------+--------------+------------------+-------------------------+--------------------------------+----------------+------------------+----------------+--------------------+-----------------------+------+---------------+-----------+------------+------------+----------+------------------+-------------+---------------+--------------------+--------------------+
|HAGRID| Kingdom|    Phylum|       Class|      Order|        Family|        Genus|     Species|         Common name|Female maturity (days)|Male maturity (days)|Gestation/Incubation (days)|Weaning (days)|Litter/Clutch size|Litters/Clutches per year|Inter-litter/Interbirth interval|Birth weight (g)|Weaning weight (g)|Adult weight (g)|Growth rate (1/days)|Maximum longevity (yrs)|Source|Specimen origin|Sample size|Data quality|IMR (per yr)|MRDT (yrs)|Metabo

<div>Indexed species</div><div>----------------------<br></div>

In [18]:
val indexes = spark.readTSV("/data/indexes/salmon/1.1.0/ensembl_99/salmon_indexes.tsv").as[String].map(_.replace("_", " ")).toDF("scientific_name")
indexes.show()

+--------------------+
|     scientific_name|
+--------------------+
|Acanthochromis po...|
|     Accipiter nisus|
|Ailuropoda melano...|
|    Amazona collaria|
|Amphilophus citri...|
|Amphiprion ocellaris|
|  Amphiprion percula|
|  Anabas testudineus|
|Anas platyrhyncho...|
| Anolis carolinensis|
|Anser brachyrhynchus|
|     Anser cygnoides|
|     Aotus nancymaae|
|     Apteryx haastii|
|      Apteryx owenii|
|        Apteryx rowi|
|Aquila chrysaetos...|
|Astatotilapia cal...|
|  Astyanax mexicanus|
|Astyanax mexicanu...|
+--------------------+
only showing top 20 rows



In [19]:
val indexed_species = indexes.join(anage, "scientific_name")
  .withColumn("url", concat(lit("https://www.ensembl.org/"), $"Genus", lit("_"), $"Species"))
  .sort($"Class", $"Maximum longevity (yrs)".desc)
  .select(
      $"scientific_name".as("scientific_name").as[String], 
      $"Common name".as("common_name").as[String],
      $"url".as[String],
  $"Class".as("class").as[String], 
  $"Maximum longevity (yrs)".as("lifespan").as[String],
  $"Body mass (g)".as("mass_g").as[String],
  $"Metabolic rate (W)".as("metabolic_rate"),
  $"Temperature (K)".as("temperature_k"),
  $"Specimen origin".as("specimen_origin").as[String],
  $"Sample size".as("sample_size").as[String],
  $"Data quality".as("data_quality").as[String]
  )
indexed_species

[scientific_name: string, common_name: string ... 9 more fields]

In [20]:
indexed_species.writeTSV("/data/indexes/salmon/1.1.0/ensembl_99/indexed_species.tsv", true)

org.apache.spark.sql.AnalysisException: path file:/data/indexes/salmon/1.1.0/ensembl_99/indexed_species.tsv already exists.;

In [21]:
val samples_partial = indexed_species.join(samples.withColumnRenamed("organism","scientific_name"), "scientific_name").sort("class", "lifespan")
samples_partial.show()

+-------------------+--------------+--------------------+-----+--------+------+--------------+-------------+---------------+-----------+------------+-----------+-----------+----------+------+--------------------+--------------------+----------------+--------------+------------------+---------+--------------------+--------------------+------------------+---------+-------+-----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------+-------+-------------+--------------------+
|    scientific_name|   common_name|                 url|class|lifespan|mass_g|metabolic_rate|temperature_k|specimen_origin|sample_size|data_quality| bioproject|     series|       run| taxid|         sample_name|           sequencer|library_strategy|library_layout| library_selection|    study|         study_title|     characteristics|            source|      age|    sex|tumor|            protocol|salmon_version|               index|      

In [22]:
samples_partial.writeTSV("/data/samples/samples_partial.tsv", header = true, rewrite = true)

parts of /data/samples/samples_partial.tsv merged!


/data/samples/samples_partial.tsv

In [23]:
samples_partial.groupBy($"class").agg(countDistinct($"scientific_name").as("species"),countDistinct($"run").as("runs")).orderBy($"runs".desc).show(10,1000)

+--------------+-------+----+
|         class|species|runs|
+--------------+-------+----+
|      Mammalia|     37| 489|
|          Aves|      8|  52|
|     Teleostei|      6|  15|
|      Reptilia|      3|   9|
|Chondrichthyes|      1|   4|
|   Coelacanthi|      1|   2|
+--------------+-------+----+



In [24]:
samples_partial.groupBy($"class",  $"scientific_name", $"common_name").agg(countDistinct($"run")as("runs")).orderBy($"class",$"runs".desc).show(100,1000)

+--------------+-------------------------+--------------------------+----+
|         class|          scientific_name|               common_name|runs|
+--------------+-------------------------+--------------------------+----+
|          Aves|              Parus major|                 Great tit|  13|
|          Aves|            Gallus gallus|            Red junglefowl|  10|
|          Aves|      Cyanistes caeruleus|                  Blue tit|   8|
|          Aves|      Taeniopygia guttata|               Zebra finch|   7|
|          Aves|   Zonotrichia albicollis|    White-throated sparrow|   6|
|          Aves|        Coturnix japonica|            Japanese quail|   4|
|          Aves|      Meleagris gallopavo|               Wild turkey|   3|
|          Aves|      Strigops habroptila|                    Kakapo|   1|
|Chondrichthyes|      Callorhinchus milii|             Elephant fish|   4|
|   Coelacanthi|      Latimeria chalumnae|                Coelacanth|   2|
|      Mammalia|         

In [25]:
def by_species =  udf[String, String, String]{ case (species, genes) => 
    "/data/samples/species/by_species/" + species.replace(" ", "_") + "/" + genes.substring(Math.max(0, genes.lastIndexOf("/")+1)) 
    }

In [26]:
val str = samples_partial.select($"scientific_name", $"genes",  by_species($"scientific_name",$"genes").as("new_path")).as[(String, String, String)].collect().map{ case (species,a, b) => 
val p = s"/data/samples/species/by_species/${species.replace(" ", "_")}/"
s"mkdir -p $p" + "\n" + "cp " + a + " " + p}
.toList.mkString("\n")
import ammonite.ops._
write.over(Path("/data/samples/species/copy_samples.sh"), str)

In [27]:
val samples_proper = samples_partial.select($"bioproject",$"series",$"run",
$"sample_name",
$"scientific_name",$"common_name",$"taxid",$"class",$"lifespan",$"mass_g",
$"sequencer",  $"library_strategy",$"library_layout",$"library_selection",
$"study",$"study_title",$"characteristics",$"source",$"age",$"sex",$"tumor",$"protocol",$"salmon_version",
$"index",by_species($"scientific_name",$"genes").as("genes"),
$"libType",
$"modified")
samples_proper.show(10, 1000)

+-----------+-----------+----------+---------------------+-------------------+--------------+------+-----+--------+------+-------------------+----------------+--------------+------------------+---------+------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+---------+------+-----+---------------------------------------+--------------+--------------------------------+----------------------------------------------------------------------------------------------------------+-------+-----------------------+
| bioproject|     series|       run|          sample_name|    scientific_name|   common_name| taxid|class|lifespan|mass_g|          sequencer|library_strategy|library_layout| libr

In [28]:
samples_proper.select("genes").show(10, 10000)

+----------------------------------------------------------------------------------------------------------+
|                                                                                                     genes|
+----------------------------------------------------------------------------------------------------------+
|  /data/samples/species/by_species/Coturnix_japonica/PRJNA296888_SRX1458273_SRR2968903_genes_abundance.tsv|
|  /data/samples/species/by_species/Coturnix_japonica/PRJNA296888_SRX1458275_SRR2968905_genes_abundance.tsv|
|  /data/samples/species/by_species/Coturnix_japonica/PRJNA296888_SRX1458274_SRR2968904_genes_abundance.tsv|
|  /data/samples/species/by_species/Coturnix_japonica/PRJNA296888_SRX1458240_SRR2968870_genes_abundance.tsv|
|    /data/samples/species/by_species/Taeniopygia_guttata/PRJDB3398_DRX081222_DRR087393_genes_abundance.tsv|
|/data/samples/species/by_species/Meleagris_gallopavo/PRJNA342653_SRX2164867_SRR4244360_genes_abundance.tsv|
|/data/samples/spec

In [29]:
samples_proper.writeParquet("/data/samples/species/samples_index.parquet", true)

org.apache.spark.sql.AnalysisException: path file:/data/samples/species/samples_index.parquet already exists.;

In [30]:
val counts = spark.readTSV("/data/samples/species/SELECT_t___FROM_public_view_ensemble_ana.tsv", header = true)
counts.show(10000,10000)

+--------------------------+------------------------------+------------------+--------------+-------+------+----------+-------------+
|           scientific_name|                   common_name|             class|lifespan_years|studies|  runs|mass_grams|temperature_k|
+--------------------------+------------------------------+------------------+--------------+-------+------+----------+-------------+
|              Mus musculus|                   House mouse|          Mammalia|           4.0|   5637|348537|      18.0|       310.05|
|              Homo sapiens|                         Human|          Mammalia|         122.5|   5452|277578|   70000.0|       310.15|
|               Danio rerio|      Zebra danio or zebrafish|         Teleostei|           5.5|    361| 24879|      null|         null|
|   Drosophila melanogaster|                     Fruit fly|           Insecta|           0.3|    669| 19002|      null|         null|
|         Rattus norvegicus|                    Norway rat|   