# Samples processing


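This notebook filters the RNA-Seq sample sheet, joins it with AnAge species annotations and the list of Salmon-indexed species, and writes the resulting sample index.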




In [1]:
import org.apache.spark.rdd._
import org.apache.spark.sql._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.functions._
import group.research.aging.spark.extensions._

## Samples

In [3]:
// Sample sheet restricted to RNA-Seq runs.
// Missing string values are replaced by the "N/A" sentinel first, so the filter
// below also drops rows whose `index` was originally empty.
val samples = {
  val sheet = spark.readTSV("/data/samples/species/species.tsv", header = true)
  sheet.na.fill("N/A")
    .filter($"index" =!= "N/A" && $"library_strategy" === "RNA-Seq")
    .orderBy($"organism".desc, $"library_layout")
}
samples

[bioproject: string, series: string ... 24 more fields]

In [4]:
// List run, organism and gene-abundance path for the samples, organisms in descending order.
// The large limits effectively disable row and cell truncation.
samples
  .select("run", "organism", "genes")
  .orderBy($"organism".desc)
  .show(numRows = 100000, truncate = 1000000)

+----------+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|       run|                organism|                                                                                                                                        genes|
+----------+------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+
| SRR636978|        Ursus americanus|                                        /data/samples/species/GSE43013/GSM1055128/SRR636978/GSE43013_GSM1055128_SRR636978_genes_abundance.tsv|
| SRR636888|        Ursus americanus|                                        /data/samples/species/GSE43013/GSM1055038/SRR636888/GSE43013_GSM1055038_SRR636888_genes_abundance.tsv|
| SRR636887|        Ursus americanus|                                    /data/samples/species/PRJNA

In [5]:
// Export the filtered sample sheet, passing "," as the separator (so the output is comma-separated).
// NOTE(review): the file is named samples_partial.csv but holds `samples`, not the
// `samples_partial` frame built later — confirm the name is intended.
samples.writeTSV("/data/samples/samples_partial.csv", sep=",")

parts of /data/samples/samples_partial.csv merged!


/data/samples/samples_partial.csv

In [6]:
/**
  * Builds a Spark UDF that wraps a string column value between two fixed strings.
  *
  * @param prefix text placed in front of the column value
  * @param sufix  text appended after the column value
  * @return a one-argument UDF yielding `prefix + value + sufix`
  */
def prefix(prefix: String, sufix: String) =
  udf[String, String](value => s"$prefix$value$sufix")

## Extended species file

In [8]:
// AnAge table with an extra `scientific_name` column built as "<Genus> <Species>".
// `concat` yields null when either part is null.
val anage = {
  val anageRaw = spark.readTSV("/data/databases/anage/anage_data.tsv", header = true)
  val binomial = concat($"Genus", lit(" "), $"Species")
  anageRaw.withColumn("scientific_name", binomial)
}
anage.show()

+------+--------+----------+------------+-----------+--------------+-------------+------------+--------------------+----------------------+--------------------+---------------------------+--------------+------------------+-------------------------+--------------------------------+----------------+------------------+----------------+--------------------+-----------------------+------+---------------+-----------+------------+------------+----------+------------------+-------------+---------------+--------------------+--------------------+
|HAGRID| Kingdom|    Phylum|       Class|      Order|        Family|        Genus|     Species|         Common name|Female maturity (days)|Male maturity (days)|Gestation/Incubation (days)|Weaning (days)|Litter/Clutch size|Litters/Clutches per year|Inter-litter/Interbirth interval|Birth weight (g)|Weaning weight (g)|Adult weight (g)|Growth rate (1/days)|Maximum longevity (yrs)|Source|Specimen origin|Sample size|Data quality|IMR (per yr)|MRDT (yrs)|Metabo

## Indexed species

In [10]:
// Names listed in the Salmon index file, with underscores turned into spaces,
// exposed as a single `scientific_name` column.
val indexes = spark
  .readTSV("/data/indexes/salmon/1.1.0/ensembl_99/salmon_indexes.tsv")
  .as[String]
  .map(name => name.replace("_", " "))
  .toDF("scientific_name")
indexes.show()

+--------------------+
|     scientific_name|
+--------------------+
|Acanthochromis po...|
|     Accipiter nisus|
|Ailuropoda melano...|
|    Amazona collaria|
|Amphilophus citri...|
|Amphiprion ocellaris|
|  Amphiprion percula|
|  Anabas testudineus|
|Anas platyrhyncho...|
| Anolis carolinensis|
|Anser brachyrhynchus|
|     Anser cygnoides|
|     Aotus nancymaae|
|     Apteryx haastii|
|      Apteryx owenii|
|        Apteryx rowi|
|Aquila chrysaetos...|
|Astatotilapia cal...|
|  Astyanax mexicanus|
|Astyanax mexicanu...|
+--------------------+
only showing top 20 rows



In [11]:
// Indexed species annotated with AnAge traits.
// The join on `scientific_name` is an inner join, so only species present in both
// `indexes` and `anage` survive. Rows are ordered by class, then by maximum
// longevity (descending), before the columns are renamed to snake_case.
val indexed_species = indexes.join(anage, "scientific_name")
  .withColumn("url", concat(lit("https://www.ensembl.org/"), $"Genus", lit("_"), $"Species"))
  .sort($"Class", $"Maximum longevity (yrs)".desc)
  // Untyped projection: `.as[String]` encoder hints do not cast in a varargs `select`,
  // so they are omitted; column types stay whatever `readTSV` produced.
  .select(
    $"scientific_name",
    $"Common name".as("common_name"),
    $"url",
    $"Class".as("class"),
    $"Maximum longevity (yrs)".as("lifespan"),
    $"Body mass (g)".as("mass_g"),
    $"Metabolic rate (W)".as("metabolic_rate"),
    $"Temperature (K)".as("temperature_k"),
    $"Specimen origin".as("specimen_origin"),
    $"Sample size".as("sample_size"),
    $"Data quality".as("data_quality")
  )
indexed_species

[scientific_name: string, common_name: string ... 9 more fields]

In [12]:
// Persist the annotated list of indexed species next to the Salmon indexes.
// NOTE(review): the positional `true` is presumably the header flag — confirm against the writeTSV signature.
indexed_species.writeTSV("/data/indexes/salmon/1.1.0/ensembl_99/indexed_species.tsv", true)

parts of /data/indexes/salmon/1.1.0/ensembl_99/indexed_species.tsv merged!


/data/indexes/salmon/1.1.0/ensembl_99/indexed_species.tsv

In [13]:
// Attach species-level annotations to every sample run. The inner join drops
// samples whose organism has no match in `indexed_species`.
val samples_partial = {
  val samplesByName = samples.withColumnRenamed("organism", "scientific_name")
  indexed_species
    .join(samplesByName, "scientific_name")
    .orderBy("class", "lifespan")
}
samples_partial.show()

+-------------------+--------------+--------------------+--------+--------+------+--------------+-------------+---------------+-----------+------------+-----------+-----------+----------+-----+--------------------+-------------------+----------------+--------------+------------------+---------+--------------------+--------------------+-----------+-------------+-------+-----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+-------+-------------+--------------------+
|    scientific_name|   common_name|                 url|   class|lifespan|mass_g|metabolic_rate|temperature_k|specimen_origin|sample_size|data_quality| bioproject|     series|       run|taxid|         sample_name|          sequencer|library_strategy|library_layout| library_selection|    study|         study_title|     characteristics|     source|          age|    sex|tumor|            protocol|salmon_version|               index|               genes|    

In [14]:
// Export the annotated samples, passing "," as the separator (so the output is comma-separated).
// NOTE(review): `samples_partial` is written to species_partial.csv, while samples_partial.csv
// holds the un-joined `samples` — confirm the file names are not swapped.
samples_partial.writeTSV("/data/samples/species_partial.csv", sep=",")

parts of /data/samples/species_partial.csv merged!


/data/samples/species_partial.csv

In [15]:
/**
  * UDF that maps (scientific name, gene-abundance file path) to the file's location
  * in the per-species layout: `/data/samples/species/by_species/<Genus_species>/<file name>`.
  *
  * Spaces in the species name become underscores; the file name is whatever follows
  * the last "/" of the input path.
  *
  * NOTE(review): `samples` fills missing values with "N/A", which contains a slash —
  * such a `genes` value would map to ".../A". Confirm that `genes` is never missing here.
  */
def by_species = udf[String, String, String] { (species: String, genes: String) =>
  val speciesDir = species.replace(" ", "_")
  // lastIndexOf yields -1 when there is no slash, so the whole string is kept as the file name.
  val fileName = genes.substring(genes.lastIndexOf("/") + 1)
  "/data/samples/species/by_species/" + speciesDir + "/" + fileName
}

In [16]:
import ammonite.ops._

// Shell script that copies every sample's gene-abundance file into its per-species
// directory: one `mkdir -p` + `cp` pair per sample, joined by newlines.
// Only the species name and the source path are collected; the directory is derived
// here, so no extra UDF column is needed.
val str = {
  // Single-quote a value for the shell, escaping any embedded single quote.
  def shQuote(value: String): String = "'" + value.replace("'", "'\\''") + "'"

  samples_partial
    .select($"scientific_name", $"genes")
    .as[(String, String)]
    .collect()
    .map { case (species, genesPath) =>
      val targetDir = s"/data/samples/species/by_species/${species.replace(" ", "_")}/"
      s"mkdir -p ${shQuote(targetDir)}\ncp ${shQuote(genesPath)} ${shQuote(targetDir)}"
    }
    .mkString("\n")
}
write.over(Path("/data/samples/species/copy_samples.sh"), str)

In [17]:
// Final sample index: a fixed column order, with `genes` rewritten by `by_species`
// to point at the per-species directory layout.
val samples_proper = {
  val beforeGenes = Seq(
    "bioproject", "series", "run",
    "sample_name",
    "scientific_name", "common_name", "taxid", "class", "lifespan", "mass_g",
    "sequencer", "library_strategy", "library_layout", "library_selection",
    "study", "study_title", "characteristics", "source", "age", "sex", "tumor",
    "protocol", "salmon_version",
    "index"
  ).map(col)
  val relocatedGenes = by_species($"scientific_name", $"genes").as("genes")
  val afterGenes = Seq("libType", "modified").map(col)
  samples_partial.select((beforeGenes :+ relocatedGenes) ++ afterGenes: _*)
}
samples_proper.show(10, 1000)

+-----------+-----------+----------+---------------------+-------------------+--------------+-----+-----+--------+------+-------------------+----------------+--------------+------------------+---------+-------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+---------+-------+-----+--------+--------------+--------------------------------+----------------------------------------------------------------------------------------------------------+-------+-----------------------+
| bioproject|     series|       run|          sample_name|    scientific_name|   common_name|taxid|class|lifespan|mass_g|          sequencer|library_strategy|library_layout| library_selection|    study|                    

In [18]:
// Spot-check the rewritten gene-abundance paths (first 10 rows, effectively untruncated).
samples_proper
  .select("genes")
  .show(numRows = 10, truncate = 10000)

+----------------------------------------------------------------------------------------------------------+
|                                                                                                     genes|
+----------------------------------------------------------------------------------------------------------+
|  /data/samples/species/by_species/Coturnix_japonica/PRJNA296888_SRX1458273_SRR2968903_genes_abundance.tsv|
|  /data/samples/species/by_species/Coturnix_japonica/PRJNA296888_SRX1458274_SRR2968904_genes_abundance.tsv|
|  /data/samples/species/by_species/Coturnix_japonica/PRJNA296888_SRX1458275_SRR2968905_genes_abundance.tsv|
|  /data/samples/species/by_species/Coturnix_japonica/PRJNA296888_SRX1458240_SRR2968870_genes_abundance.tsv|
|/data/samples/species/by_species/Meleagris_gallopavo/PRJNA342653_SRX2164858_SRR4244351_genes_abundance.tsv|
|/data/samples/species/by_species/Meleagris_gallopavo/PRJNA342653_SRX2164835_SRR4244328_genes_abundance.tsv|
|/data/samples/spec

In [19]:
// Persist the final sample index as Parquet.
// NOTE(review): the meaning of the positional `true` is not visible here — confirm against the writeParquet signature.
samples_proper.writeParquet("/data/samples/species/samples_index.parquet", true)

parts of /data/samples/species/samples_index.parquet merged!


/data/samples/species/samples_index.parquet

In [20]:
// Per-species summary table (studies and runs per species, alongside lifespan, mass and temperature columns).
// Presumably exported from a database view, judging by the file name — confirm.
val counts = spark.readTSV(
  "/data/samples/species/SELECT_t___FROM_public_view_ensemble_ana.tsv",
  header = true
)
counts.show(numRows = 10000, truncate = 10000)

+--------------------------+------------------------------+------------------+--------------+-------+------+----------+-------------+
|           scientific_name|                   common_name|             class|lifespan_years|studies|  runs|mass_grams|temperature_k|
+--------------------------+------------------------------+------------------+--------------+-------+------+----------+-------------+
|              Mus musculus|                   House mouse|          Mammalia|           4.0|   5637|348537|      18.0|       310.05|
|              Homo sapiens|                         Human|          Mammalia|         122.5|   5452|277578|   70000.0|       310.15|
|               Danio rerio|      Zebra danio or zebrafish|         Teleostei|           5.5|    361| 24879|      null|         null|
|   Drosophila melanogaster|                     Fruit fly|           Insecta|           0.3|    669| 19002|      null|         null|
|         Rattus norvegicus|                    Norway rat|   

In [22]:
// Distinct organisms present in the sample sheet, exposed as `scientific_name`
// so they can be joined against the species tables.
val orgs = samples
  .select($"organism")
  .distinct()
  .withColumnRenamed("organism", "scientific_name")
orgs.show(numRows = 100, truncate = 1000)

+------------------------+
|         scientific_name|
+------------------------+
|            Homo sapiens|
|        Ursus americanus|
|              Mus caroli|
|            Capra hircus|
|       Macaca nemestrina|
|         Pan troglodytes|
|       Coturnix japonica|
|      Microcebus murinus|
|   Oryctolagus cuniculus|
|     Meleagris gallopavo|
|Ornithorhynchus anatinus|
|            Pan paniscus|
|            Mus musculus|
|              Ovis aries|
|         Cavia porcellus|
|         Gorilla gorilla|
|           Gallus gallus|
|          Equus caballus|
|  Canis lupus familiaris|
|   Monodelphis domestica|
|          Macaca mulatta|
|             Parus major|
|       Rattus norvegicus|
|     Macaca fascicularis|
|      Callithrix jacchus|
|              Sus scrofa|
|    Notamacropus eugenii|
|              Bos taurus|
|   Heterocephalus glaber|
|             Felis catus|
|    Mesocricetus auratus|
+------------------------+



In [23]:
// Number of distinct organisms in the sample sheet.
orgs.count()

31

In [25]:
// Rows of the species summary table whose class is Mammalia.
val mams = counts.filter($"class" === "Mammalia")
mams.count()

54

In [21]:
// Mammals from the summary table that have no samples yet: the anti-join keeps
// only `mams` rows whose `scientific_name` is absent from `orgs`.
val lf = mams.join(orgs, usingColumns = Seq("scientific_name"), joinType = "left_anti")
lf.show(numRows = 10000, truncate = 1000)

+--------------------------+------------------------------+--------+--------------+-------+----+----------+-------------+
|           scientific_name|                   common_name|   class|lifespan_years|studies|runs|mass_grams|temperature_k|
+--------------------------+------------------------------+--------+--------------+-------+----+----------+-------------+
|        Tursiops truncatus|           Bottlenosed dolphin|Mammalia|          51.6|      5| 397|      null|         null|
|       Chinchilla lanigera|        Long-tailed chinchilla|Mammalia|          17.2|      2|  61|     436.7|       307.85|
|             Bos grunniens|                           Yak|Mammalia|          26.3|     11|  51|      null|       311.15|
|     Meriones unguiculatus|                Mongolian jird|Mammalia|           6.3|      6|  50|      64.8|       311.35|
|    Phascolarctos cinereus|                         Koala|Mammalia|          22.1|      3|  45|    4732.4|       308.95|
|          Tupaia belang

<br>