# Samples processing


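This notebook filters the RNA-Seq sample table, joins the Salmon-indexed species with the AnAge data, and writes the resulting tables to disk.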




In [1]:
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._

## Samples

In [3]:
// Load the sample table. Nulls in string columns are replaced with "N/A" so that rows
// without an index (null or literally "N/A") can be dropped by a plain comparison;
// only RNA-Seq libraries are kept.
val samples = spark
  .readTSV("/data/samples/species/species.tsv", headers = true)
  .na.fill("N/A")
  .filter(($"index" =!= "N/A") && ($"library_strategy" === "RNA-Seq"))
  .orderBy($"organism".desc, $"library_layout")
samples

[bioproject: string, series: string ... 24 more fields]

In [4]:
// Display-only: the re-sorted frame is not assigned, so `samples` itself is unchanged.
samples.orderBy(desc("organism"))

[bioproject: string, series: string ... 24 more fields]

In [5]:
// Persist the filtered `samples` table with "," passed as `sep`.
// NOTE(review): the target is named samples_partial.csv although it holds `samples`,
// not the `samples_partial` join result — confirm the file name is intentional.
samples.writeTSV("/data/samples/samples_partial.csv", sep=",")

parts of /data/samples/samples_partial.csv merged!


/data/samples/samples_partial.csv

In [6]:
// Number of samples per organism, largest groups first.
val counts = samples
  .groupBy($"organism")
  .count()
  .sort(desc("count"))
// Print up to 1000 rows without truncating long organism names.
counts.show(1000, false)

+------------------------+-----+
|organism                |count|
+------------------------+-----+
|Homo sapiens            |93   |
|Rattus norvegicus       |74   |
|Heterocephalus glaber   |15   |
|Ovis aries              |12   |
|Gorilla gorilla         |10   |
|Callithrix jacchus      |10   |
|Equus caballus          |9    |
|Mus caroli              |8    |
|Oryctolagus cuniculus   |7    |
|Macaca mulatta          |7    |
|Pan troglodytes         |7    |
|Felis catus             |7    |
|Gallus gallus           |6    |
|Sus scrofa              |6    |
|Pan paniscus            |6    |
|Cavia porcellus         |5    |
|Macaca nemestrina       |5    |
|Macaca fascicularis     |5    |
|Mus musculus            |5    |
|Ursus americanus        |5    |
|Mesocricetus auratus    |4    |
|Canis lupus familiaris  |4    |
|Coturnix japonica       |4    |
|Capra hircus            |4    |
|Meleagris gallopavo     |3    |
|Monodelphis domestica   |3    |
|Microcebus murinus      |3    |
|Bos tauru

## Species

In [8]:
// Load the AnAge table and derive a "Genus Species" key for joining with other tables.
// `concat` yields null when either part is null, so such rows get a null scientific_name.
val anage = spark
  .readTSV("/data/databases/anage/anage_data.tsv", headers = true)
  .withColumn("scientific_name", concat(col("Genus"), lit(" "), col("Species")))
anage.show()

+------+--------+----------+------------+-----------+--------------+-------------+------------+--------------------+----------------------+--------------------+---------------------------+--------------+------------------+-------------------------+--------------------------------+----------------+------------------+----------------+--------------------+-----------------------+------+---------------+-----------+------------+------------+----------+------------------+-------------+---------------+--------------------+--------------------+
|HAGRID| Kingdom|    Phylum|       Class|      Order|        Family|        Genus|     Species|         Common name|Female maturity (days)|Male maturity (days)|Gestation/Incubation (days)|Weaning (days)|Litter/Clutch size|Litters/Clutches per year|Inter-litter/Interbirth interval|Birth weight (g)|Weaning weight (g)|Adult weight (g)|Growth rate (1/days)|Maximum longevity (yrs)|Source|Specimen origin|Sample size|Data quality|IMR (per yr)|MRDT (yrs)|Metabo

## Indexed species

In [10]:
// Read the list of Salmon index names (single string column) and turn the
// underscore-separated names into space-separated scientific names.
val indexes = spark
  .readTSV("/data/indexes/salmon/1.0.0/ensembl_97/salmon_indexes.tsv")
  .as[String]
  .map(name => name.replace("_", " "))
  .toDF("scientific_name")
indexes.show()

+--------------------+
|     scientific_name|
+--------------------+
|Acanthochromis po...|
|Ailuropoda melano...|
|Amphilophus citri...|
|Amphiprion ocellaris|
|  Amphiprion percula|
|  Anabas testudineus|
|Anas platyrhyncho...|
| Anolis carolinensis|
|Anser brachyrhynchus|
|     Aotus nancymaae|
|     Apteryx haastii|
|      Apteryx owenii|
|        Apteryx rowi|
|Astatotilapia cal...|
|  Astyanax mexicanus|
|   Bison bison bison|
|  Bos indicus hybrid|
|           Bos mutus|
|          Bos taurus|
|   Bos taurus hybrid|
+--------------------+
only showing top 20 rows



In [11]:
// Indexed species enriched with AnAge data: joins on scientific_name, adds an Ensembl URL,
// and projects the AnAge columns under snake_case names.
val indexed_species = {
  // URL of the form https://www.ensembl.org/<Genus>_<Species>.
  val ensemblUrl = concat(lit("https://www.ensembl.org/"), $"Genus", lit("_"), $"Species")

  indexes
    .join(anage, "scientific_name")
    .withColumn("url", ensemblUrl)
    // NOTE(review): if readTSV loads every column as a string, longevity is ordered
    // lexicographically rather than numerically — confirm the column type.
    .orderBy($"Class", $"Maximum longevity (yrs)".desc)
    .select(
      $"scientific_name".as[String],
      $"Common name".as("common_name").as[String],
      $"url".as[String],
      $"Class".as("class").as[String],
      $"Maximum longevity (yrs)".as("lifespan").as[String],
      $"Body mass (g)".as("mass_g").as[String],
      $"Metabolic rate (W)".as("metabolic_rate"),
      $"Temperature (K)".as("temperature_k"),
      $"Specimen origin".as("specimen_origin").as[String],
      $"Sample size".as("sample_size").as[String],
      $"Data quality".as("data_quality").as[String]
    )
}
indexed_species

[scientific_name: string, common_name: string ... 9 more fields]

In [12]:
// Persist the indexed-species table next to the Salmon indexes.
// NOTE(review): the positional `true` is presumably the header flag — confirm against
// writeTSV's signature.
indexed_species.writeTSV("/data/indexes/salmon/1.0.0/ensembl_97/indexed_species.tsv", true)

parts of /data/indexes/salmon/1.0.0/ensembl_97/indexed_species.tsv merged!


/data/indexes/salmon/1.0.0/ensembl_97/indexed_species.tsv

In [13]:
// Attach species-level columns to every sample. The join on scientific_name is an inner
// equi-join, so samples whose organism is absent from `indexed_species` are dropped.
val samples_partial = {
  // `samples` calls the species column "organism"; align it with the join key.
  val samplesByName = samples.withColumnRenamed("organism", "scientific_name")

  indexed_species
    .join(samplesByName, "scientific_name")
    // NOTE(review): `lifespan` may be a string column, in which case this ordering is
    // lexicographic — confirm.
    .orderBy("class", "lifespan")
}
samples_partial.show()

+-------------------+--------------+--------------------+--------+--------+------+--------------+-------------+---------------+-----------+------------+-----------+-----------+----------+-----+--------------------+-------------------+----------------+--------------+------------------+---------+--------------------+--------------------+-----------+-------------+-------+-----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+-------+-------------+--------------------+
|    scientific_name|   common_name|                 url|   class|lifespan|mass_g|metabolic_rate|temperature_k|specimen_origin|sample_size|data_quality| bioproject|     series|       run|taxid|         sample_name|          sequencer|library_strategy|library_layout| library_selection|    study|         study_title|     characteristics|     source|          age|    sex|tumor|            protocol|salmon_version|               index|               genes|    

In [14]:
// Persist the species-annotated samples with "," passed as `sep`.
// NOTE(review): `samples_partial` is written to species_partial.csv, while the plain
// `samples` table went to samples_partial.csv — confirm the file names are not swapped.
samples_partial.writeTSV("/data/samples/species_partial.csv", sep=",")

parts of /data/samples/species_partial.csv merged!


/data/samples/species_partial.csv